shhh <- suppressPackageStartupMessages # It's a library, so shhh!

shhh(library( mgcv ))
shhh(library(dplyr))
shhh(library(ggplot2))
shhh(library(lme4))
shhh(library(tidymv))
shhh(library(gamlss))
shhh(library(gsubfn))
shhh(library(lmerTest))
shhh(library(tidyverse))
shhh(library(boot))
shhh(library(rsample))
shhh(library(plotrix))
shhh(library(ggrepel))
shhh(library(mgcv))

shhh(library(brms))
shhh(library(bayesplot))
shhh(library(patchwork))
shhh(library(MASS))
shhh(library(tidyr))
shhh(library(extraDistr))
shhh(library(purrr))
# For exercises with Stan code
shhh(library(rstan))
options(mc.cores = parallel::detectCores())
rstan_options(auto_write = FALSE)

library(car)
library(coda)
shhh(library(gridExtra))

theme_set(theme_bw())
options(digits=4)
options(scipen=999)
set.seed(444)
pipe_message = function(.data, status) {message(status); .data}

Read in MoTR Data


rate = 160

file_prefix = "../data/provo_f160/"
fnames = list.files(path=file_prefix)

df = data.frame()
for (f in fnames) {
  temp = read.csv(paste0(file_prefix, "/", f)) %>%
    mutate(subj = str_remove(f, "_reading_measures.csv"))
  df = rbind(df, temp)
}

# Filter out readers who don't answer the comprehension questions correctly
filter_df = df %>%
  group_by(para_nr, subj) %>% summarise(correct = if_else(unique(correctness) == 1, 1, 0)) %>% ungroup() %>%
  drop_na() %>%
  group_by(subj) %>% summarise(p_correct = mean(correct)) %>% ungroup() %>%
  mutate(p_correct = round(p_correct, digits = 2))
`summarise()` has grouped output by 'para_nr'. You can override using the `.groups` argument.
filter_df = filter_df %>% filter(p_correct < 0.8)
filter_list = filter_df$subj
## reader_3:0.70, reader_60:0.79, reader_76:0.72 , reader_256:0.71 , reader_262:0.57 

raw_df = df %>%
  filter(! subj %in% c(filter_list)) %>%
  mutate(word = str_trim(word)) %>%
  mutate(subj = str_remove(subj, "reader_")) %>%
  mutate(subj = as.character(subj)) %>%
  # filter(! subj %in% c("3", "60", "76", "256", "262")) %>% # Explanation for this filtering
  mutate(FPReg = if_else(total_duration == 0, -1, FPReg)) %>% #If the word is skipped we can't say that it wasn't regressed on the first pass. Set to a "NA"
  dplyr::select(expr_id, cond_id, para_nr, word, word_nr, first_duration, total_duration, gaze_duration, go_past_time, FPReg, subj)

length(unique(raw_df$subj))
[1] 46
df %>%
  filter(! subj %in% c(filter_list)) %>%
  filter(FPReg >= 0) %>%
  dplyr::select(FPReg) %>%
  drop_na() %>%
  summarise( m = mean(FPReg))

df %>%
  filter(! subj %in% c(filter_list)) %>%
  dplyr::select(FPFix) %>%
  drop_na() %>%
  summarise( m = mean(FPFix))
NA
NA
# Average across subjects
motr_agg_df = raw_df %>%
  gather(metric, value, 6:10) %>%
    filter(value >= 0) %>% #Removes the "NA" values for FPReg
  
    # ==== Remove skipped words
    # mutate(zero = if_else(metric != "FPReg" & value == 0,T, F)) %>%
    # filter(zero == F) %>%
  
    drop_na() %>%
    group_by(para_nr, word_nr, word, metric) %>% 
      mutate(outlier = if_else(metric != "FPReg" & value > (mean(value) + 3 * sd(value)), T, F)) %>% filter(outlier == F) %>%
      summarise(value = mean(value), nsubj = length(unique(subj))) %>%
  ungroup() %>%
  arrange(para_nr, word_nr) %>%
  rename(text_id = para_nr, word_text_idx = word_nr, motr_value = value)
`summarise()` has grouped output by 'para_nr', 'word_nr', 'word'. You can override using the `.groups` argument.
# View(motr_agg_df)
# write.csv(motr_agg_df, file = "/Users/cui/Desktop/MoTR/pipeline/ancillary_data/motr_agg_df.csv", row.names = FALSE)

Comparison to Provo

# Read in Provo surprisal, frequency and length data
provo_modeling_df = read.csv("../data/provo_stats.csv") %>%
  dplyr::select(text_id, sent_id, trigger_idx, word, freq, surp, len) %>%
  rename(word_idx = trigger_idx)

provo_modeling_df
NA
# Read in Provo eyetracking data

provo_raw_df = read.csv("../data/provo_eyetracking.csv")

# unique(provo_raw_df$Participant_ID)
# length(unique(provo_raw_df$Participant_ID))

provo_eyetracking_df = provo_raw_df %>%
  dplyr::select(Participant_ID, Text_ID, Sentence_Number, Word_In_Sentence_Number, Word, Word_Number, IA_FIRST_FIX_PROGRESSIVE, IA_FIRST_RUN_DWELL_TIME, IA_DWELL_TIME, IA_REGRESSION_PATH_DURATION, IA_REGRESSION_OUT, IA_SKIP) %>%
  rename( #first_duration = IA_FIRST_FIXATION_DURATION,   
          gaze_duration = IA_FIRST_RUN_DWELL_TIME,
          total_duration = IA_DWELL_TIME,
          go_past_time = IA_REGRESSION_PATH_DURATION,
          subj = Participant_ID,
          text_id = Text_ID,
          sent_id = Sentence_Number,
          word_idx = Word_In_Sentence_Number,
          word_text_idx = Word_Number,   # IA_ID?
          word = Word,      # Word?
          FPReg = IA_REGRESSION_OUT,
          skip = IA_SKIP,
          ff_progressive = IA_FIRST_FIX_PROGRESSIVE) %>%
  mutate(first_duration = gaze_duration) %>%
  mutate(gaze_duration = if_else(ff_progressive == 0, 0, as.double(gaze_duration)),
         go_past_time = if_else(ff_progressive == 0, 0, as.double(go_past_time))) %>%
  dplyr::select(-ff_progressive) %>%
  
  mutate(
    gaze_duration = if_else(total_duration == 0, 0, as.double(gaze_duration)),
      go_past_time = if_else(total_duration == 0, 0, as.double(go_past_time)),
      FPReg = if_else(total_duration == 0, -1, as.double(FPReg)),
      first_duration =  if_else(total_duration == 0, 0, as.double(first_duration)),
  ) %>%
  
  # drop_na() %>%     # will drop the whole row with all the metrics
  gather(metric, value, 7:12) %>%
  filter(value >= 0) %>%          # filter skipped word in eye tracking data for FPReg
  # ==== Remove skipped words
  # mutate(zero = if_else(metric != "FPReg" & value == 0,T, F)) %>%
  # filter(zero == F) %>%
  
  # mutate(value = if_else(is.na(value), as.integer(0), as.integer(value))) %>%
  # mutate(value = if_else(metric != "FPReg" & is.na(value), as.integer(0), as.integer(value))) %>%
  drop_na() %>%
  mutate(word = str_trim(word)) %>%
  mutate(subj = str_remove(subj, "Sub")) %>%
  mutate(subj = as.integer(subj)) %>%
    group_by(text_id, word_text_idx, sent_id, word_idx, word, metric) %>%
    mutate(outlier = if_else(metric != "FPReg" & metric != "skip" & value > (mean(value) + 3 * sd(value) ), T, F)) %>%
    filter(outlier == F) %>%
  ungroup() #%>%

# Aggregate cross-participant data for all subjects
provo_eyetracking_agg_df = provo_eyetracking_df %>%
  group_by(text_id, word_text_idx, sent_id, word_idx, word, metric) %>%
    summarise(value = mean(value),
              nsubj = length(unique(subj))) %>%
    ungroup()
`summarise()` has grouped output by 'text_id', 'word_text_idx', 'sent_id', 'word_idx', 'word'. You can override using the `.groups` argument.
provo_raw_df %>%
  dplyr::select(IA_REGRESSION_OUT) %>%
  drop_na() %>%
  summarise( m = mean(IA_REGRESSION_OUT))

provo_raw_df %>%
  dplyr::select(IA_SKIP) %>%
  drop_na() %>%
  summarise( m = mean(IA_SKIP))
NA
NA

# Split the eyetracking data in two by subjects to see how well it correlates with itself
provo_eyetracking_subj1_df_temp = provo_eyetracking_df %>%
  filter(subj <= 42) %>%
  mutate(word_text_idx = as.integer(word_text_idx - 1)) %>%
  group_by(text_id, word_text_idx, sent_id, word_idx, word, metric) %>%
    summarise(value = mean(value)) %>%
  ungroup() %>%
  rename(value_1 = value) #%>%
`summarise()` has grouped output by 'text_id', 'word_text_idx', 'sent_id', 'word_idx', 'word'. You can override using the `.groups` argument.
  # dplyr::select(-sent_id, -word_idx)


provo_eyetracking_subj1_df = merge(provo_eyetracking_subj1_df_temp, motr_agg_df, by=c("text_id", "word_text_idx", "metric")) %>%
  arrange(text_id, sent_id, word_idx) %>%
  filter(!(text_id == 13 & word_text_idx >= 20 & word_text_idx <= 52)) %>%
  filter(!(text_id == 3 & word_text_idx >= 46 & word_text_idx <= 57)) %>%
  rename(word = word.y) %>%
  dplyr::select(text_id, word_text_idx, metric, word, value_1)

# View(provo_eyetracking_subj1_df)

provo_eyetracking_subj2_df = provo_eyetracking_df %>%
  filter(subj > 42) %>%
  mutate(word_text_idx = as.integer(word_text_idx - 1)) %>%
  group_by(text_id, word_text_idx, sent_id, word_idx, word, metric) %>%
    summarise(value = mean(value)) %>%
  ungroup() %>%
    rename(value_2 = value)%>%
  dplyr::select(-sent_id, -word_idx)
`summarise()` has grouped output by 'text_id', 'word_text_idx', 'sent_id', 'word_idx', 'word'. You can override using the `.groups` argument.
# View(provo_eyetracking_subj2_df)
  
provo_eyetr_grouped_df = merge(provo_eyetracking_subj2_df, provo_eyetracking_subj1_df, by=c("text_id", "word_text_idx", "metric")) %>%
  # filter(word.x == word.y) %>%
  dplyr::select(-word.y) %>%
  group_by(metric) %>%
    mutate(motr_outlier = if_else(metric != "FPReg" & metric != "skip" & value_1 > (mean(value_1) + 3 * sd(value_1) ), T, F)) %>%
    filter(motr_outlier == F) %>%
    mutate(eyetr_outlier = if_else(metric != "FPReg" & metric != "skip" & value_2 > (mean(value_2) + 3 * sd(value_2) ), T, F)) %>%
    filter(eyetr_outlier == F) %>%
  ungroup() %>%
  gather(measure, value, c("value_1", "value_2")) %>%
  dplyr::select(-motr_outlier, -eyetr_outlier)

# View(provo_eyetr_grouped_df)
provo_df = merge(provo_eyetracking_agg_df, provo_modeling_df, by=c("text_id", "sent_id", "word_idx")) %>%
  mutate(word_text_idx = as.integer(word_text_idx - 1)) %>%
  arrange(text_id, sent_id, word_idx) %>%
  rename(eyetr_value = value) 

provo_df = merge(provo_df, motr_agg_df, by=c("text_id", "word_text_idx", "metric")) %>%
arrange(text_id, sent_id, word_idx) %>%
  # almost all the word.x != word.y is because of normalization problem, so we can keep them, instead, deleting some special cases
filter(!(text_id == 13 & word_text_idx >= 20 & word_text_idx <= 52)) %>%
  filter(!(text_id == 3 & word_text_idx >= 46 & word_text_idx <= 57)) %>%
# filter(word.x == word) #%>%
dplyr::select(-word.x, -word.y) %>%
group_by(metric) %>%
  mutate(motr_outlier = if_else(metric != "FPReg" & motr_value > (mean(motr_value) + 3 * sd(motr_value) ), T, F)) %>%
  filter(motr_outlier == F) %>%
  mutate(eyetr_outlier = if_else(metric != "FPReg" & eyetr_value > (mean(eyetr_value) + 3 * sd(eyetr_value) ), T, F)) %>%
  filter(eyetr_outlier == F) %>%
ungroup() %>%
gather(measure, value, c("eyetr_value", "motr_value")) %>%
dplyr::select(-motr_outlier, -eyetr_outlier)
  
# provo_df

Bayesian – use Stan – motr & eyetr correlation

print("Gaze Duration")
[1] "Gaze Duration"
gd_df = provo_df %>% filter(metric == "gaze_duration") %>% 
  spread(measure, value) %>%
  # smoothing, if includes 0s
  mutate(eyetr_value =  pmax(eyetr_value, 1),
         motr_value = pmax(motr_value, 1)
  ) %>%
  mutate(eyetr_value_log = log(eyetr_value),
         motr_value_log = log(motr_value))
print(cor.test(gd_df$eyetr_value, gd_df$motr_value)$estimate)
   cor 
0.7874 
print(cor.test(gd_df$eyetr_value_log, gd_df$motr_value_log)$estimate)
   cor 
0.6055 
# View(gd_df)
gd_df %>% 
  gather(measure, value, 12:15) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

NA
# center data around 0.

gd_temp <- gd_df[c("eyetr_value", "motr_value")] %>%
   # mutate(eyetr_value = eyetr_value - mean(eyetr_value),
   #      motr_value = motr_value - mean(motr_value)) %>%
  data.matrix()

gd_temp_log <- gd_df[c("eyetr_value_log", "motr_value_log")] %>%
 mutate(eyetr_value_log = eyetr_value_log - mean(eyetr_value_log), 
        motr_value_log = motr_value_log - mean(motr_value_log)) %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))
# Plot the first data matrix gd_temp
plot(gd_temp, pch = 16, col = "blue",
     main = "Not Log-Transformed")
# Plot the second data matrix gd_temp_log
plot(gd_temp_log, pch = 16, col = "red",
     main = "Centered Log-Transformed")

gd_data = list(x=gd_temp, N=nrow(gd_temp))

fit_gd = stan(
  file="stan_models/bivariate_correlation.stan", 
  data=gd_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  verbose = FALSE
  )

# Save the model 
fit_gd@stanmodel@dso <- new("cxxdso")
saveRDS(fit_gd, file = paste0("./bayesian_models/bayesian_models_correlation/motr_eyetr_gaze_duration_cor_drop0s.rds"))
print("Go Past Time")
[1] "Go Past Time"
gpt_df = provo_df %>% filter(metric == "go_past_time") %>% 
  spread(measure, value) %>%
  # smoothing, if includes 0s
  mutate(eyetr_value =  pmax(eyetr_value, 1),
         motr_value = pmax(motr_value, 1)
  ) %>%
  mutate(eyetr_value_log = log(eyetr_value),
         motr_value_log = log(motr_value))
print(cor.test(gpt_df$eyetr_value, gpt_df$motr_value)$estimate)
   cor 
0.7292 
print(cor.test(gpt_df$eyetr_value_log, gpt_df$motr_value_log)$estimate)
   cor 
0.5889 
gpt_df %>% 
  gather(measure, value, 12:15) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

gpt_temp <- gpt_df[c("eyetr_value", "motr_value")] %>% data.matrix()

gpt_temp_log <- gpt_df[c("eyetr_value_log", "motr_value_log")] %>%
 mutate(eyetr_value_log = eyetr_value_log - mean(eyetr_value_log), 
        motr_value_log = motr_value_log - mean(motr_value_log)) %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))

# Plot the first data matrix gpt_temp
plot(gpt_temp, pch = 16, col = "blue",
     main = "Not Log-Transformed")
# Plot the second data matrix gpt_temp_log
plot(gpt_temp_log, pch = 16, col = "red",
     main = "Centered Log-Transformed")

# -------fit model go past time ----------
gpt_data = list(x=gpt_temp, N=nrow(gpt_temp))
fit_gpt = stan(
  file="stan_models/bivariate_correlation.stan", 
  data=gpt_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  verbose = FALSE
  )

# Save the model 
fit_gpt@stanmodel@dso <- new("cxxdso")
saveRDS(fit_gpt, file = paste0("./bayesian_models/bayesian_models_correlation/motr_eyetr_go_past_time_cor_drop0s.rds"))
print("Total Duration")
[1] "Total Duration"
td_df = provo_df %>% filter(metric == "total_duration") %>% 
  spread(measure, value) %>%
  # smoothing, if includes 0s
  mutate(eyetr_value =  pmax(eyetr_value, 1),
         motr_value = pmax(motr_value, 1)
  ) %>%
  mutate(eyetr_value_log = log(eyetr_value),
         motr_value_log = log(motr_value))
print(cor.test(td_df$eyetr_value, td_df$motr_value)$estimate)
   cor 
0.7601 
print(cor.test(td_df$eyetr_value_log, td_df$motr_value_log)$estimate)
   cor 
0.6421 
td_df %>% 
  gather(measure, value, 12:15) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

td_temp <- td_df[c("eyetr_value", "motr_value")] %>% data.matrix()

td_temp_log <- td_df[c("eyetr_value_log", "motr_value_log")] %>%
 mutate(eyetr_value_log = eyetr_value_log - mean(eyetr_value_log), 
        motr_value_log = motr_value_log - mean(motr_value_log)) %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))

# Plot the first data matrix td_temp
plot(td_temp, pch = 16, col = "blue",
     main = "Not Log-Transformed")
# Plot the second data matrix td_temp_log
plot(td_temp_log, pch = 16, col = "red",
     main = "Centered Log-Transformed")

# -------fit model total duration ----------
td_data = list(x=td_temp, N=nrow(td_temp))
fit_td = stan(
  file="stan_models/bivariate_correlation.stan", 
  data=td_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  verbose = FALSE
  )

# Save the model 
fit_td@stanmodel@dso <- new("cxxdso")
saveRDS(fit_td, file = paste0("./bayesian_models/bayesian_models_correlation/motr_eyetr_total_duration_cor_drop0s.rds"))
print("First Pass Regression Prob.")
[1] "First Pass Regression Prob."
reg_df = provo_df %>% filter(metric == "FPReg") %>% 
  spread(measure, value) %>%
  filter(eyetr_value > 0, motr_value > 0) %>%
  mutate(eyetr_value =  pmax(eyetr_value, 1e-5),
         motr_value = pmax(motr_value, 1e-5))
print(cor.test(reg_df$eyetr_value, reg_df$motr_value)$estimate)
   cor 
0.3225 
# View(reg_df)
reg_df %>% 
  gather(measure, value, 12:13) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

reg_temp <- reg_df[c("eyetr_value", "motr_value")] %>% data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))

# Plot the first data matrix td_temp
plot(reg_temp, pch = 16, col = "blue",
     main = "Not Log-Transformed")

# -------fit model FPReg ----------
reg_data = list(x=reg_temp, N=nrow(reg_temp))
fit_reg = stan(
  file="stan_models/bivariate_normal_reg.stan", 
  data=reg_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  verbose = FALSE
  )

# Save the model 
fit_reg@stanmodel@dso <- new("cxxdso")
saveRDS(fit_reg, file = paste0("./bayesian_models/bayesian_models_correlation/motr_eyetr_FPReg_cor.rds"))
# models with all 0s
fit_gd = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_gaze_duration_cor.rds")
fit_gpt = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_go_past_time_cor.rds")
fit_td = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_total_duration_cor.rds")
fit_reg = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_FPReg_cor.rds")

# models for drop 0s
# fit_gd = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_gaze_duration_cor_drop0s.rds")
# fit_gpt = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_go_past_time_cor_drop0s.rds")
# fit_td = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_total_duration_cor_drop0s.rds")
# fit_reg = readRDS("./bayesian_models/bayesian_models_correlation/ranked_motr_eyetr_FPReg_cor.rds")

print('---------------------------- Gaze Duration--------------------------------------------')
[1] "---------------------------- Gaze Duration--------------------------------------------"
print(fit_gd)
Inference for Stan model: bivariate_correlation.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

                  mean se_mean     sd      2.5%       25%       50%       75%     97.5% n_eff Rhat
mu[1]           131.55    0.02   1.46    128.66    130.56    131.55    132.54    134.38  5271    1
mu[2]           213.43    0.04   2.74    208.13    211.57    213.40    215.28    218.88  5555    1
sigma[1]         70.09    0.01   1.04     68.09     69.39     70.07     70.78     72.17  5098    1
sigma[2]        130.80    0.03   1.96    127.05    129.45    130.81    132.10    134.65  4929    1
nu               54.25    0.19  15.25     31.07     43.24     51.93     62.89     90.26  6732    1
rho               0.79    0.00   0.01      0.78      0.79      0.79      0.80      0.81  5263    1
cov[1,1]       4913.42    2.05 146.49   4636.86   4814.49   4909.58   5010.11   5207.93  5102    1
cov[1,2]       7255.71    3.72 241.40   6793.44   7092.28   7254.29   7414.55   7739.22  4201    1
cov[2,1]       7255.71    3.72 241.40   6793.44   7092.28   7254.29   7414.55   7739.22  4201    1
cov[2,2]      17113.69    7.32 513.63  16142.03  16758.30  17110.41  17451.39  18131.15  4929    1
x_rand[1]       140.41    0.72  64.84     23.76     93.88    138.20    183.28    273.84  8002    1
x_rand[2]       233.42    1.33 118.97     28.76    145.37    228.59    312.35    485.69  7985    1
attempt           0.07    0.00   0.27      0.00      0.00      0.00      0.00      1.00  8312    1
max_attempts     10.00     NaN   0.00     10.00     10.00     10.00     10.00     10.00   NaN  NaN
lp__         -25988.72    0.03   1.70 -25992.74 -25989.64 -25988.40 -25987.47 -25986.37  3832    1

Samples were drawn using NUTS(diag_e) at Fri Aug 11 12:08:31 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
print('---------------------------- Go Past Time--------------------------------------------')
[1] "---------------------------- Go Past Time--------------------------------------------"
print(fit_gpt)
Inference for Stan model: bivariate_correlation.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

                  mean se_mean     sd      2.5%       25%       50%       75%     97.5% n_eff Rhat
mu[1]           154.74    0.03   2.04    150.72    153.38    154.72    156.15    158.66  4427    1
mu[2]           219.83    0.05   3.30    213.32    217.62    219.82    222.12    226.18  4679    1
sigma[1]         86.23    0.02   1.59     83.16     85.17     86.21     87.29     89.36  4481    1
sigma[2]        140.41    0.04   2.59    135.27    138.67    140.40    142.12    145.48  4528    1
nu                7.48    0.01   0.68      6.29      7.01      7.43      7.91      8.94  4992    1
rho               0.77    0.00   0.01      0.75      0.76      0.77      0.77      0.79  5877    1
cov[1,1]       7438.72    4.09 273.59   6915.18   7254.05   7432.86   7620.07   7984.43  4482    1
cov[1,2]       9299.88    5.84 377.30   8577.98   9044.57   9292.09   9554.07  10053.55  4172    1
cov[2,1]       9299.88    5.84 377.30   8577.98   9044.57   9292.09   9554.07  10053.55  4172    1
cov[2,2]      19721.18   10.81 727.65  18298.54  19228.50  19713.43  20197.70  21164.45  4535    1
x_rand[1]       172.47    0.99  87.13     25.82    111.76    164.61    224.46    367.21  7717    1
x_rand[2]       250.79    1.58 138.92     29.39    150.75    237.02    333.51    565.33  7718    1
attempt           0.11    0.00   0.36      0.00      0.00      0.00      0.00      1.00  7990    1
max_attempts     10.00     NaN   0.00     10.00     10.00     10.00     10.00     10.00   NaN  NaN
lp__         -27100.13    0.03   1.74 -27104.37 -27101.08 -27099.80 -27098.85 -27097.73  3647    1

Samples were drawn using NUTS(diag_e) at Fri Aug 11 12:20:52 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
print('---------------------------- Total Duration--------------------------------------------')
[1] "---------------------------- Total Duration--------------------------------------------"
print(fit_td)
Inference for Stan model: bivariate_correlation.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

                  mean se_mean     sd      2.5%       25%       50%       75%     97.5% n_eff Rhat
mu[1]           179.24    0.03   1.98    175.47    177.92    179.22    180.55    183.12  4792    1
mu[2]           263.66    0.05   3.62    256.71    261.20    263.64    266.10    270.68  4380    1
sigma[1]         89.32    0.02   1.49     86.43     88.30     89.32     90.31     92.28  4916    1
sigma[2]        163.49    0.04   2.76    158.18    161.59    163.45    165.36    169.00  4968    1
nu               18.63    0.05   3.79     12.93     15.97     18.00     20.65     27.82  5111    1
rho               0.77    0.00   0.01      0.76      0.77      0.78      0.78      0.79  5791    1
cov[1,1]       7980.52    3.80 266.89   7470.94   7797.69   7978.25   8155.66   8515.25  4927    1
cov[1,2]      11320.89    6.21 414.97  10526.78  11036.95  11314.00  11594.89  12161.76  4461    1
cov[2,1]      11320.89    6.21 414.97  10526.78  11036.95  11314.00  11594.89  12161.76  4461    1
cov[2,2]      26736.46   12.80 902.86  25019.85  26112.35  26714.39  27344.03  28561.76  4976    1
x_rand[1]       191.67    0.95  84.90     40.17    132.02    186.55    244.73    373.85  8049    1
x_rand[2]       290.88    1.70 152.51     33.10    178.96    280.69    387.99    624.44  8005    1
attempt           0.08    0.00   0.29      0.00      0.00      0.00      0.00      1.00  7786    1
max_attempts     10.00     NaN   0.00     10.00     10.00     10.00     10.00     10.00   NaN  NaN
lp__         -27334.42    0.03   1.75 -27338.56 -27335.34 -27334.10 -27333.14 -27332.05  3439    1

Samples were drawn using NUTS(diag_e) at Fri Aug 11 12:24:09 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
print('---------------------------- First Pass Regression Prob.--------------------------------------------')
[1] "---------------------------- First Pass Regression Prob.--------------------------------------------"
print(fit_reg)
Inference for Stan model: bivariate_correlation_reg.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

               mean se_mean      sd     2.5%      25%      50%      75%    97.5% n_eff Rhat
alpha[1]       0.85    0.00    0.02     0.81     0.84     0.85     0.86     0.89  5443    1
alpha[2]       0.15    0.00    0.00     0.14     0.14     0.15     0.15     0.15  7702    1
beta[1]        4.64    0.00    0.14     4.37     4.54     4.64     4.73     4.92  5975    1
beta[2]        2.52    0.00    0.12     2.29     2.44     2.52     2.60     2.76  7877    1
L[1,1]         1.00     NaN    0.00     1.00     1.00     1.00     1.00     1.00   NaN  NaN
L[1,2]         0.00     NaN    0.00     0.00     0.00     0.00     0.00     0.00   NaN  NaN
L[2,1]         0.45    0.00    0.38    -0.41     0.18     0.52     0.77     0.95  8489    1
L[2,2]         0.78    0.00    0.21     0.31     0.64     0.85     0.96     1.00  6201    1
mu[1]          2.34    0.00    0.05     2.25     2.31     2.34     2.37     2.44  5468    1
mu[2]          1.16    0.00    0.00     1.15     1.16     1.16     1.16     1.16  7701    1
sigma[1]     104.39    0.19   14.73    78.70    93.94   103.18   113.48   136.81  5984    1
sigma[2]      12.55    0.02    1.50     9.91    11.50    12.44    13.51    15.76  7872    1
rho[1,1]       1.00     NaN    0.00     1.00     1.00     1.00     1.00     1.00   NaN  NaN
rho[1,2]       0.45    0.00    0.38    -0.41     0.18     0.52     0.77     0.95  8489    1
rho[2,1]       0.45    0.00    0.38    -0.41     0.18     0.52     0.77     0.95  8489    1
rho[2,2]       1.00    0.00    0.00     1.00     1.00     1.00     1.00     1.00   592    1
Sigma[1,1] 11114.32   41.44 3195.37  6193.09  8824.35 10646.59 12878.40 18717.61  5946    1
Sigma[1,2]   583.46    5.69  518.49  -527.26   234.91   654.32   971.41  1433.33  8291    1
Sigma[2,1]   583.46    5.69  518.49  -527.26   234.91   654.32   971.41  1433.33  8291    1
Sigma[2,2]   159.88    0.44   38.81    98.15   132.31   154.69   182.47   248.29  7789    1
x_rand[1]      0.16    0.00    0.14     0.00     0.05     0.12     0.23     0.53  7955    1
x_rand[2]      0.06    0.00    0.12     0.00     0.00     0.00     0.05     0.45  7762    1
lp__       14798.20    0.03    1.56 14794.21 14797.42 14798.55 14799.33 14800.22  3508    1

Samples were drawn using NUTS(diag_e) at Sat Jul 22 20:39:27 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
# stan_trace(fit_gd, pars=c("rho", "mu", "sigma", "nu"))
# stan_dens(fit_gd, pars=c("rho", "mu", "sigma", "nu"), separate_chains = TRUE)
# stan_plot(fit_gd, pars=c("rho", "mu", "sigma", "nu"))

# Gaze Duration
stan_trace(fit_gd)
stan_dens(fit_gd, separate_chains = TRUE)
stan_plot(fit_gd)

# Go Past Time
stan_trace(fit_gpt)
stan_dens(fit_gpt, separate_chains = TRUE)
stan_plot(fit_gpt)

# Total Duration
stan_trace(fit_td)
stan_dens(fit_td, separate_chains = TRUE)
stan_plot(fit_td)

# FPReg
stan_trace(fit_reg)
stan_dens(fit_reg, separate_chains = TRUE)
stan_plot(fit_reg)
p1 <- stan_trace(fit_gd, pars = 'rho', inc_warmup = FALSE)
p2 <- stan_dens(fit_gd, pars = 'rho', separate_chains = TRUE)
p3 <- stan_trace(fit_gd, pars = 'mu[1]', inc_warmup = FALSE)
p4 <- stan_dens(fit_gd, pars = 'mu[1]', separate_chains = TRUE)
p5 <- stan_trace(fit_gd, pars = 'mu[2]', inc_warmup = FALSE)
p6 <- stan_dens(fit_gd, pars = 'mu[2]', separate_chains = TRUE)
p7 <- stan_trace(fit_gd, pars = 'sigma[1]', inc_warmup = FALSE)
p8 <- stan_dens(fit_gd, pars = 'sigma[1]', separate_chains = TRUE)
p9 <- stan_trace(fit_gd, pars = 'sigma[2]', inc_warmup = FALSE)
p10 <- stan_dens(fit_gd, pars = 'sigma[2]', separate_chains = TRUE)
p11 <- stan_trace(fit_gd, pars = 'nu', inc_warmup = FALSE)
p12 <- stan_dens(fit_gd, pars = 'nu', separate_chains = TRUE)


# Use grid.arrange() to arrange the plots
# grid.arrange(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, ncol=2, nrow=6)
print('---------------------------- Gaze Duration--------------------------------------------')
[1] "---------------------------- Gaze Duration--------------------------------------------"
rho_gd = as.numeric(extract(fit_gd, "rho")[[1]])
mean = mean(rho_gd)
crI = quantile(rho_gd, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_gd), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
Mean: 0.7912
HPD: [0.7764, 0.8057]
crI: [0.776, 0.8054]
print('---------------------------- Go Past Time--------------------------------------------')
[1] "---------------------------- Go Past Time--------------------------------------------"
rho_gpt = as.numeric(extract(fit_gpt, "rho")[[1]])
mean = mean(rho_gpt)
crI = quantile(rho_gpt, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_gpt), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
Mean: 0.7678
HPD: [0.7495, 0.7858]
crI: [0.7492, 0.7856]
print('---------------------------- Total Duration--------------------------------------------')
[1] "---------------------------- Total Duration--------------------------------------------"
rho_td = as.numeric(extract(fit_td, "rho")[[1]])
mean = mean(rho_td)
crI = quantile(rho_td, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_td), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
Mean: 0.775
HPD: [0.7582, 0.7917]
crI: [0.7579, 0.7916]
print('---------------------------- First Pass Regression --------------------------------------------')
[1] "---------------------------- First Pass Regression --------------------------------------------"
# rho_reg = as.numeric(extract(fit_reg, "rho[1, 2]")[[1]])
rho_reg = as.numeric(extract(fit_reg, "rho")[[1]])
mean = mean(rho_reg)
crI = quantile(rho_reg, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_reg), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]")
Mean: 0.7228
HPD: [-0.1241, 1]
crI: [-0.2814, 1]
print('---------------------------- Gaze Duration--------------------------------------------')
gd_rand <- extract(fit_gd, "x_rand")[[1]]
# x_rand_filtered <- x_rand[apply(x_rand, 1, function(x) all(x > 0)),]
# x_rand_filtered

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 400), ylim=c(0, 700), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "Gaze Duration") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(gd_rand[,1], gd_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(gd_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(gd_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(gd_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- Go Past Time--------------------------------------------')
gpt_rand <- extract(fit_gpt, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 800), ylim=c(0, 1200), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "Go Past Time") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(gpt_rand[,1], gpt_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(gpt_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(gpt_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(gpt_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- Total Duration--------------------------------------------')
td_rand <- extract(fit_td, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 800), ylim=c(0, 1200), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "Total Duration") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(td_rand[,1], td_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(td_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(td_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(td_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- First Pass Regression --------------------------------------------')
reg_rand <- extract(fit_reg, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "FPReg") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(reg_rand[,1], reg_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(reg_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(reg_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(reg_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

model motr eyetr FPReg correlation (eyetr < 0.3)

print("First Pass Regression Prob. all and  < 0.3")
[1] "First Pass Regression Prob. all and  < 0.3"
reg_df_all = provo_df %>% filter(metric == "FPReg") %>% 
  spread(measure, value) %>%
  # filter(eyetr_value > 0, motr_value > 0) %>%
  mutate(eyetr_value =  pmax(eyetr_value, 1e-5),
         motr_value = pmax(motr_value, 1e-5))

reg_df_low_drop0 = provo_df %>% filter(metric == "FPReg") %>% 
  spread(measure, value) %>%
  filter(eyetr_value > 0, motr_value > 0) %>%
  mutate(eyetr_value =  pmax(eyetr_value, 1e-5),
         motr_value = pmax(motr_value, 1e-5)) %>%
  filter(eyetr_value < 0.3)

reg_df_low = provo_df %>% filter(metric == "FPReg") %>% 
  spread(measure, value) %>%
  # filter(eyetr_value > 0, motr_value > 0) %>%
  mutate(eyetr_value =  pmax(eyetr_value, 1e-5),
         motr_value = pmax(motr_value, 1e-5)) %>%
  filter(eyetr_value < 0.3)
  # mutate(eyetr_value = exp(eyetr_value),
         # motr_value = exp(motr_value)
         # )
# View(reg_df)

print(cor.test(reg_df_all$eyetr_value, reg_df_all$motr_value)$estimate)
   cor 
0.2454 
print(cor.test(reg_df_all$eyetr_value, reg_df_all$motr_value)$p.value)
[1] 0.000000000000000000000000000000000004731
print(cor.test(reg_df_low$eyetr_value, reg_df_low$motr_value)$estimate)
    cor 
0.09488 
print(cor.test(reg_df_low$eyetr_value, reg_df_low$motr_value)$p.value)
[1] 0.000006321
print(cor.test(reg_df_low_drop0$eyetr_value, reg_df_low_drop0$motr_value)$estimate)
    cor 
0.05585 
print(cor.test(reg_df_low_drop0$eyetr_value, reg_df_low_drop0$motr_value)$p.value)
[1] 0.1193
# View(reg_df)
reg_df_low %>% 
  gather(measure, value, 12:13) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

reg_temp_all <- reg_df_all[c("eyetr_value", "motr_value")] %>% data.matrix()
reg_temp_low <- reg_df_low[c("eyetr_value", "motr_value")] %>% data.matrix()
reg_temp_low_drop0 <- reg_df_low_drop0[c("eyetr_value", "motr_value")] %>% data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))

# Plot the first data matrix td_temp
plot(reg_temp_low, pch = 16, col = "blue",
     main = "Not Log-Transformed")

# -------fit model FPReg < 0.3 ----------
reg_data = list(x=reg_temp_all, N=nrow(reg_temp_all))
fit_reg = stan(
  # file="stan_models/bivariate_beta_correlation_reg.stan", 
  file = "stan_models/bivariate_normal_reg.stan",
  data=reg_data, 
  iter=4000, 
  chains=4, 
  cores=4,
  seed=444,
  # control=list(adapt_delta=0.99), 
  verbose = FALSE
  )

# Save the model 
fit_reg@stanmodel@dso <- new("cxxdso")
saveRDS(fit_reg, file = paste0("./bayesian_models/bayesian_models_correlation/motr_eyetr_FPReg_cor_all_data_drop0s.rds"))

model motr eyetr FPReg correlation (eyetr >= 0.3)

print("First Pass Regression Prob. >= 0.3")
[1] "First Pass Regression Prob. >= 0.3"
reg_df_high_drop0 = provo_df %>% filter(metric == "FPReg") %>% 
  spread(measure, value) %>%
  filter(eyetr_value > 0, motr_value > 0) %>%
  mutate(eyetr_value =  pmax(eyetr_value, 1e-5),
         motr_value = pmax(motr_value, 1e-5)) %>%
  filter(eyetr_value >= 0.3)

reg_df_high = provo_df %>% filter(metric == "FPReg") %>% 
  spread(measure, value) %>%
  # filter(eyetr_value > 0, motr_value > 0) %>%
  mutate(eyetr_value =  pmax(eyetr_value, 1e-5),
         motr_value = pmax(motr_value, 1e-5)) %>%
  filter(eyetr_value >= 0.3)
  # mutate(eyetr_value = exp(eyetr_value),
         # motr_value = exp(motr_value)
         # )
# View(reg_df)

print(cor.test(reg_df_high$eyetr_value, reg_df_high$motr_value)$estimate)
   cor 
0.3952 
print(cor.test(reg_df_high$eyetr_value, reg_df_high$motr_value)$p.value)
[1] 0.000000000009391
print(cor.test(reg_df_high_drop0$eyetr_value, reg_df_high_drop0$motr_value)$estimate)
  cor 
0.489 
print(cor.test(reg_df_high_drop0$eyetr_value, reg_df_high_drop0$motr_value)$p.value)
[1] 0.000000001007
# View(reg_df)
reg_df_high %>% 
  gather(measure, value, 12:13) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

reg_temp_high <- reg_df_high[c("eyetr_value", "motr_value")] %>% data.matrix()
reg_temp_high_drop0 <- reg_df_high_drop0[c("eyetr_value", "motr_value")] %>% data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 3))

# Plot the first data matrix td_temp
plot(reg_temp_all, pch = 16, col = "blue",
     main = "FPReg not logged all data")
plot(reg_temp_low, pch = 16, col = "blue",
     main = "FPReg not logged eyetr < 0.3 ")
plot(reg_temp_high, pch = 16, col = "blue",
     main = "FPReg not logged eyetr >= 0.3")

# -------fit model FPReg >= 0.3 ----------
reg_data = list(x=reg_temp_high_drop0, N=nrow(reg_temp_high_drop0))
fit_reg = stan(
  # file="stan_models/bivariate_beta_correlation_reg.stan", 
  file = "stan_models/bivariate_normal_reg.stan",
  data=reg_data, 
  iter=4000, 
  chains=4, 
  cores=4,
  seed=444,
  # control=list(adapt_delta=0.99), 
  verbose = FALSE
  )

# Save the model 
fit_reg@stanmodel@dso <- new("cxxdso")
saveRDS(fit_reg, file = paste0(".//bayesian_models/bayesian_models_correlation/motr_eyetr_FPReg_cor_03-1_drop0s.rds"))
fit_mreg_all = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_FPReg_cor_all_data.rds")
fit_mreg_all_drop0 = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_FPReg_cor_all_data_drop0s.rds")
fit_mreg_low = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_FPReg_cor_00-03.rds")
fit_mreg_low_drop0 = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_FPReg_cor_00-03_drop0s.rds")
fit_mreg_high = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_FPReg_cor_03-1.rds")
fit_mreg_high_drop0 = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_FPReg_cor_03-1_drop0s.rds")

print('---------------------------- First Pass Regression Prob. all data --------------------------------------------')
[1] "---------------------------- First Pass Regression Prob. all data --------------------------------------------"
print(fit_mreg_all)
Inference for Stan model: bivariate_normal_reg.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

                mean se_mean   sd    2.5%     25%     50%     75%   97.5% n_eff Rhat
mu[1]           0.12    0.00 0.00    0.12    0.12    0.12    0.13    0.13  5811    1
mu[2]           0.03    0.00 0.00    0.02    0.03    0.03    0.03    0.03  3384    1
sigma[1]        0.08    0.00 0.00    0.07    0.07    0.08    0.08    0.08  3694    1
sigma[2]        0.05    0.00 0.00    0.05    0.05    0.05    0.05    0.06  3040    1
nu              2.42    0.00 0.14    2.16    2.33    2.42    2.52    2.71  3250    1
rho             0.16    0.00 0.02    0.11    0.14    0.16    0.17    0.20  8176    1
cov[1,1]        0.01    0.00 0.00    0.01    0.01    0.01    0.01    0.01  3693    1
cov[1,2]        0.00    0.00 0.00    0.00    0.00    0.00    0.00    0.00  6348    1
cov[2,1]        0.00    0.00 0.00    0.00    0.00    0.00    0.00    0.00  6348    1
cov[2,2]        0.00    0.00 0.00    0.00    0.00    0.00    0.00    0.00  3053    1
x_rand[1]       0.16    0.00 0.12    0.02    0.09    0.14    0.20    0.40  7811    1
x_rand[2]       0.07    0.00 0.08    0.00    0.03    0.05    0.09    0.25  7966    1
attempt         0.63    0.01 1.03    0.00    0.00    0.00    1.00    3.00  7919    1
max_attempts   10.00     NaN 0.00   10.00   10.00   10.00   10.00   10.00   NaN  NaN
lp__         7450.58    0.03 1.78 7446.22 7449.65 7450.92 7451.89 7452.98  3519    1

Samples were drawn using NUTS(diag_e) at Sat Aug  5 23:08:09 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
print('---------------------------- First Pass Regression Prob. all data no 0s--------------------------------------------')
[1] "---------------------------- First Pass Regression Prob. all data no 0s--------------------------------------------"
print(fit_mreg_all_drop0)
Inference for Stan model: bivariate_normal_reg.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

                mean se_mean   sd    2.5%     25%     50%     75%   97.5% n_eff Rhat
mu[1]           0.15    0.00 0.00    0.14    0.15    0.15    0.15    0.16  8507    1
mu[2]           0.14    0.00 0.00    0.13    0.14    0.14    0.14    0.14  7923    1
sigma[1]        0.09    0.00 0.00    0.08    0.08    0.09    0.09    0.09  7186    1
sigma[2]        0.06    0.00 0.00    0.06    0.06    0.06    0.06    0.06  6227    1
nu              2.78    0.00 0.23    2.37    2.62    2.77    2.93    3.26  7008    1
rho             0.18    0.00 0.04    0.11    0.16    0.18    0.21    0.26  9833    1
cov[1,1]        0.01    0.00 0.00    0.01    0.01    0.01    0.01    0.01  7173    1
cov[1,2]        0.00    0.00 0.00    0.00    0.00    0.00    0.00    0.00  8599    1
cov[2,1]        0.00    0.00 0.00    0.00    0.00    0.00    0.00    0.00  8599    1
cov[2,2]        0.00    0.00 0.00    0.00    0.00    0.00    0.00    0.00  6234    1
x_rand[1]       0.18    0.00 0.13    0.02    0.10    0.16    0.22    0.43  7699    1
x_rand[2]       0.15    0.00 0.08    0.03    0.10    0.14    0.18    0.34  8175    1
attempt         0.16    0.00 0.43    0.00    0.00    0.00    0.00    1.00  7917    1
max_attempts   10.00     NaN 0.00   10.00   10.00   10.00   10.00   10.00   NaN  NaN
lp__         2566.44    0.03 1.75 2562.18 2565.53 2566.76 2567.72 2568.88  3487    1

Samples were drawn using NUTS(diag_e) at Sat Aug  5 23:18:11 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
print('---------------------------- First Pass Regression Prob.< 0.3--------------------------------------------')
[1] "---------------------------- First Pass Regression Prob.< 0.3--------------------------------------------"
print(fit_mreg_low)
Inference for Stan model: bivariate_normal_reg.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

                mean se_mean   sd    2.5%     25%     50%     75%   97.5% n_eff Rhat
mu[1]           0.12    0.00 0.00    0.11    0.12    0.12    0.12    0.12  8566    1
mu[2]           0.03    0.00 0.00    0.03    0.03    0.03    0.04    0.04  4981    1
sigma[1]        0.06    0.00 0.00    0.06    0.06    0.06    0.06    0.07  7280    1
sigma[2]        0.06    0.00 0.00    0.06    0.06    0.06    0.06    0.07  4552    1
nu              5.54    0.01 0.51    4.62    5.19    5.51    5.86    6.62  4637    1
rho             0.10    0.00 0.02    0.06    0.09    0.10    0.12    0.15  8596    1
cov[1,1]        0.00    0.00 0.00    0.00    0.00    0.00    0.00    0.00  7299    1
cov[1,2]        0.00    0.00 0.00    0.00    0.00    0.00    0.00    0.00  8259    1
cov[2,1]        0.00    0.00 0.00    0.00    0.00    0.00    0.00    0.00  8259    1
cov[2,2]        0.00    0.00 0.00    0.00    0.00    0.00    0.00    0.00  4571    1
x_rand[1]       0.13    0.00 0.07    0.02    0.08    0.12    0.17    0.28  8277    1
x_rand[2]       0.07    0.00 0.06    0.00    0.03    0.06    0.10    0.21  7280    1
attempt         0.53    0.01 0.90    0.00    0.00    0.00    1.00    3.00  8006    1
max_attempts   10.00     NaN 0.00   10.00   10.00   10.00   10.00   10.00   NaN  NaN
lp__         7794.82    0.03 1.78 7790.50 7793.88 7795.16 7796.13 7797.26  3232    1

Samples were drawn using NUTS(diag_e) at Sat Aug  5 20:52:03 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
print('---------------------------- First Pass Regression Prob.< 0.3 no 0s--------------------------------------------')
[1] "---------------------------- First Pass Regression Prob.< 0.3 no 0s--------------------------------------------"
print(fit_mreg_low_drop0)
Inference for Stan model: bivariate_normal_reg.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

                mean se_mean   sd    2.5%     25%     50%     75%   97.5% n_eff Rhat
mu[1]           0.13    0.00 0.00    0.13    0.13    0.13    0.13    0.14 10534    1
mu[2]           0.13    0.00 0.00    0.13    0.13    0.13    0.14    0.14  8839    1
sigma[1]        0.06    0.00 0.00    0.06    0.06    0.06    0.06    0.07  8534    1
sigma[2]        0.06    0.00 0.00    0.06    0.06    0.06    0.07    0.07  7122    1
nu              6.50    0.01 0.96    4.90    5.83    6.40    7.06    8.73  6790    1
rho             0.07    0.00 0.04    0.00    0.05    0.07    0.10    0.15 10356    1
cov[1,1]        0.00    0.00 0.00    0.00    0.00    0.00    0.00    0.00  8526    1
cov[1,2]        0.00    0.00 0.00    0.00    0.00    0.00    0.00    0.00 10435    1
cov[2,1]        0.00    0.00 0.00    0.00    0.00    0.00    0.00    0.00 10435    1
cov[2,2]        0.00    0.00 0.00    0.00    0.00    0.00    0.00    0.00  7123    1
x_rand[1]       0.14    0.00 0.07    0.02    0.09    0.13    0.18    0.28  8031    1
x_rand[2]       0.14    0.00 0.07    0.02    0.10    0.14    0.18    0.29  8006    1
attempt         0.08    0.00 0.30    0.00    0.00    0.00    0.00    1.00  7642    1
max_attempts   10.00     NaN 0.00   10.00   10.00   10.00   10.00   10.00   NaN  NaN
lp__         2737.21    0.03 1.76 2732.81 2736.27 2737.56 2738.50 2739.62  3992    1

Samples were drawn using NUTS(diag_e) at Sat Aug  5 20:30:02 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
print('---------------------------- First Pass Regression Prob.>= 0.3--------------------------------------------')
[1] "---------------------------- First Pass Regression Prob.>= 0.3--------------------------------------------"
print(fit_mreg_high)
Inference for Stan model: bivariate_normal_reg.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

               mean se_mean    sd   2.5%    25%    50%    75%  97.5% n_eff Rhat
mu[1]          0.45    0.00  0.01   0.44   0.45   0.45   0.46   0.47  5045    1
mu[2]          0.15    0.00  0.01   0.13   0.15   0.15   0.16   0.18  4628    1
sigma[1]       0.11    0.00  0.01   0.10   0.11   0.11   0.12   0.12  5622    1
sigma[2]       0.15    0.00  0.01   0.13   0.15   0.15   0.16   0.17  4499    1
nu            25.71    0.18 12.83   9.59  16.53  22.89  31.70  58.03  5109    1
rho            0.41    0.00  0.06   0.30   0.37   0.41   0.45   0.51  7137    1
cov[1,1]       0.01    0.00  0.00   0.01   0.01   0.01   0.01   0.02  5625    1
cov[1,2]       0.01    0.00  0.00   0.00   0.01   0.01   0.01   0.01  4842    1
cov[2,1]       0.01    0.00  0.00   0.00   0.01   0.01   0.01   0.01  4842    1
cov[2,2]       0.02    0.00  0.00   0.02   0.02   0.02   0.03   0.03  4553    1
x_rand[1]      0.47    0.00  0.11   0.25   0.39   0.47   0.54   0.70  7964    1
x_rand[2]      0.20    0.00  0.13   0.01   0.10   0.19   0.28   0.49  7829    1
attempt        0.19    0.01  0.47   0.00   0.00   0.00   0.00   1.00  7881    1
max_attempts  10.00     NaN  0.00  10.00  10.00  10.00  10.00  10.00   NaN  NaN
lp__         579.26    0.03  1.75 575.10 578.35 579.56 580.56 581.68  3529    1

Samples were drawn using NUTS(diag_e) at Sat Aug  5 22:25:38 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
print('---------------------------- First Pass Regression Prob.>= 0.3 no 0s--------------------------------------------')
[1] "---------------------------- First Pass Regression Prob.>= 0.3 no 0s--------------------------------------------"
print(fit_mreg_high_drop0)
Inference for Stan model: bivariate_normal_reg.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

               mean se_mean    sd   2.5%    25%    50%    75%  97.5% n_eff Rhat
mu[1]          0.48    0.00  0.01   0.46   0.48   0.48   0.49   0.51  6409    1
mu[2]          0.27    0.00  0.01   0.25   0.26   0.27   0.28   0.30  6397    1
sigma[1]       0.12    0.00  0.01   0.11   0.12   0.12   0.13   0.14  6555    1
sigma[2]       0.16    0.00  0.01   0.14   0.15   0.16   0.17   0.18  6281    1
nu            32.85    0.16 15.27  11.84  21.65  30.12  40.80  69.54  8953    1
rho            0.51    0.00  0.07   0.37   0.47   0.52   0.56   0.64  6986    1
cov[1,1]       0.02    0.00  0.00   0.01   0.01   0.02   0.02   0.02  6491    1
cov[1,2]       0.01    0.00  0.00   0.01   0.01   0.01   0.01   0.02  5145    1
cov[2,1]       0.01    0.00  0.00   0.01   0.01   0.01   0.01   0.02  5145    1
cov[2,2]       0.03    0.00  0.00   0.02   0.02   0.03   0.03   0.03  6242    1
x_rand[1]      0.49    0.00  0.13   0.25   0.41   0.49   0.57   0.74  8129    1
x_rand[2]      0.29    0.00  0.15   0.04   0.18   0.28   0.38   0.60  7467    1
attempt        0.06    0.00  0.24   0.00   0.00   0.00   0.00   1.00  7913    1
max_attempts  10.00     NaN  0.00  10.00  10.00  10.00  10.00  10.00   NaN  NaN
lp__         300.28    0.03  1.75 296.05 299.35 300.60 301.58 302.70  3555    1

Samples were drawn using NUTS(diag_e) at Sat Aug  5 22:31:08 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
# # FPReg all data
# stan_trace(fit_mreg_all)
# stan_dens(fit_mreg_all, separate_chains = TRUE)
# stan_plot(fit_mreg_all)

# stan_trace(fit_mreg_all_drop0)
# stan_dens(fit_mreg_all_drop0, separate_chains = TRUE)
# stan_plot(fit_mreg_all_drop0)

# # FPReg < 0.3
# stan_trace(fit_mreg_low)
# stan_dens(fit_mreg_low, separate_chains = TRUE)
# stan_plot(fit_mreg_low)
# 
# stan_trace(fit_mreg_low_drop0)
# stan_dens(fit_mreg_low_drop0, separate_chains = TRUE)
# stan_plot(fit_mreg_low_drop0)

# FPReg > 0.3
stan_trace(fit_mreg_high)
'pars' not specified. Showing first 10 parameters by default.

stan_dens(fit_mreg_high, separate_chains = TRUE)
'pars' not specified. Showing first 10 parameters by default.

stan_plot(fit_mreg_high)
'pars' not specified. Showing first 10 parameters by default.
ci_level: 0.8 (80% intervals)
outer_level: 0.95 (95% intervals)

stan_trace(fit_mreg_high_drop0)
'pars' not specified. Showing first 10 parameters by default.

stan_dens(fit_mreg_high_drop0, separate_chains = TRUE)
'pars' not specified. Showing first 10 parameters by default.

stan_plot(fit_mreg_high_drop0)
'pars' not specified. Showing first 10 parameters by default.
ci_level: 0.8 (80% intervals)
outer_level: 0.95 (95% intervals)

print('---------------------------- First Pass Regression all data--------------------------------------------')
[1] "---------------------------- First Pass Regression all data--------------------------------------------"
rho_mreg_all = as.numeric(extract(fit_mreg_all, "rho")[[1]])
mean = mean(rho_mreg_all)
crI = quantile(rho_mreg_all, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_mreg_all), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
Mean: 0.1574
HPD: [0.1139, 0.2017]
crI: [0.1134, 0.2014]
print('---------------------------- First Pass Regression all data no 0s--------------------------------------------')
[1] "---------------------------- First Pass Regression all data no 0s--------------------------------------------"
rho_mreg_all_drop0 = as.numeric(extract(fit_mreg_all_drop0, "rho")[[1]])
mean = mean(rho_mreg_all_drop0)
crI = quantile(rho_mreg_all_drop0, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_mreg_all_drop0), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
Mean: 0.1843
HPD: [0.1106, 0.2573]
crI: [0.1114, 0.2581]
print('---------------------------- First Pass Regression < 0.3--------------------------------------------')
[1] "---------------------------- First Pass Regression < 0.3--------------------------------------------"
rho_mreg_low = as.numeric(extract(fit_mreg_low, "rho")[[1]])
mean = mean(rho_mreg_low)
crI = quantile(rho_mreg_low, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_mreg_low), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
Mean: 0.103
HPD: [0.05775, 0.1468]
crI: [0.05805, 0.1475]
print('---------------------------- First Pass Regression < 0.3 no 0s--------------------------------------------')
[1] "---------------------------- First Pass Regression < 0.3 no 0s--------------------------------------------"
rho_mreg_low_drop0 = as.numeric(extract(fit_mreg_low_drop0, "rho")[[1]])
mean = mean(rho_mreg_low_drop0)
crI = quantile(rho_mreg_low_drop0, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_mreg_low_drop0), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
Mean: 0.07362
HPD: [-0.000616, 0.1514]
crI: [-0.003759, 0.1491]
print('---------------------------- First Pass Regression >= 0.3--------------------------------------------')
[1] "---------------------------- First Pass Regression >= 0.3--------------------------------------------"
rho_mreg_high = as.numeric(extract(fit_mreg_high, "rho")[[1]])
mean = mean(rho_mreg_high)
crI = quantile(rho_mreg_high, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_mreg_high), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
Mean: 0.4102
HPD: [0.3001, 0.5155]
crI: [0.2992, 0.5146]
print('---------------------------- First Pass Regression >= 0.3 no 0s--------------------------------------------')
[1] "---------------------------- First Pass Regression >= 0.3 no 0s--------------------------------------------"
rho_mreg_high_drop0 = as.numeric(extract(fit_mreg_high_drop0, "rho")[[1]])
mean = mean(rho_mreg_high_drop0)
crI = quantile(rho_mreg_high_drop0, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_mreg_high_drop0), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
Mean: 0.514
HPD: [0.3797, 0.6456]
crI: [0.3728, 0.6412]
print('---------------------------- First Pass Regression all data --------------------------------------------')
mallreg_rand <- extract(fit_mreg_all, "x_rand")[[1]]
# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "FPReg") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(mallreg_rand[,1], mallreg_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(reg_temp_all, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(mallreg_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(mallreg_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- First Pass Regression all data no 0s--------------------------------------------')
mallreg_rand_drop0 <- extract(fit_mreg_all_drop0, "x_rand")[[1]]
# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "FPReg") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(mallreg_rand_drop0[,1], mallreg_rand_drop0[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(reg_temp_all, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(mallreg_rand_drop0, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(mallreg_rand_drop0, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- First Pass Regression < 0.3 --------------------------------------------')
mlowreg_rand <- extract(fit_mreg_low, "x_rand")[[1]]
print(mlowreg_rand)
# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "FPReg") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(mlowreg_rand[,1], mlowreg_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(reg_temp_low, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(mlowreg_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(mlowreg_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- First Pass Regression < 0.3 no 0s --------------------------------------------')
mlowreg_rand_drop0 <- extract(fit_mreg_low_drop0, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "FPReg") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(mlowreg_rand_drop0[,1], mlowreg_rand_drop0[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(reg_temp_low_drop0, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(mlowreg_rand_drop0, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(mlowreg_rand_drop0, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- First Pass Regression >= 0.3 --------------------------------------------')
mhighreg_rand_samples <- extract(fit_mreg_high, "x_rand")[[1]]
# print(mhighreg_rand_samples)
selected_indices <- sample(1:nrow(mhighreg_rand_samples), 900)
mhighreg_rand <- mhighreg_rand_samples[selected_indices, ]
# mhighreg_rand <- extract(fit_mreg_high, "x_rand")[[1]]
# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "FPReg") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(mhighreg_rand[,1], mhighreg_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(reg_temp_high, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(mhighreg_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(mhighreg_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

# print('---------------------------- First Pass Regression >= 0.3 no 0s --------------------------------------------')
mhighreg_rand_drop0_samples <- extract(fit_mreg_high_drop0, "x_rand")[[1]]
selected_indices <- sample(1:nrow(mhighreg_rand_drop0_samples), 900)
mhighreg_rand_drop0 <- mhighreg_rand_drop0_samples[selected_indices, ]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "FPReg") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color
points(mhighreg_rand_drop0[,1], mhighreg_rand_drop0[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(reg_temp_high_drop0, pch=16, col="red")

# add dataEllipse with color
dataEllipse(mhighreg_rand_drop0, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(mhighreg_rand_drop0, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

model eye tracking to eye tracking correlation


print("Gaze Duration")
[1] "Gaze Duration"
# View(provo_eyetr_grouped_df)

egd_df = provo_eyetr_grouped_df %>% filter(metric == "gaze_duration") %>% #group_by(text_id, metric, measure) %>%
  # summarize(value = mean(value, na.rm = TRUE), .groups = 'drop') %>%
    spread(measure, value) %>%
  # smoothing, if includes 0s
  mutate(eyetr_value_1 =  pmax(value_1, 1),
         eyetr_value_2 = pmax(value_2, 1)
  ) 
print(cor.test(egd_df$eyetr_value_1, egd_df$eyetr_value_2)$estimate)
   cor 
0.9146 
# View(egd_df)

egd_df %>% 
  gather(measure, value, 5:6) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

egd_temp <- egd_df[c("eyetr_value_1", "eyetr_value_2")] %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))

# Plot the first data matrix gd_temp
plot(egd_temp, pch = 16, col = "blue",
     main = "Not Log-Transformed")

egd_data = list(x=egd_temp, N=nrow(egd_temp))

fit_egd = stan(
  file="stan_models/bivariate_correlation.stan", 
  data=egd_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  verbose = FALSE
  )

# Save the model 
fit_egd@stanmodel@dso <- new("cxxdso")
saveRDS(fit_egd, file = paste0("./bayesian_models/bayesian_models_correlation/cor_bivariate/eyetr_eyetr_gaze_duration_cor.rds"))
print("Go Past Time")
[1] "Go Past Time"
egpt_df = provo_eyetr_grouped_df %>% filter(metric == "go_past_time") %>% #group_by(text_id, metric, measure) %>%
  # summarize(value = mean(value, na.rm = TRUE), .groups = 'drop') %>%
    spread(measure, value) %>%
  # smoothing, if includes 0s
  mutate(eyetr_value_1 =  pmax(value_1, 1),
         eyetr_value_2 = pmax(value_2, 1)
  ) 
print(cor.test(egpt_df$eyetr_value_1, egpt_df$eyetr_value_2)$estimate)
   cor 
0.9134 
# View(egd_df)

egpt_df %>% 
  gather(measure, value, 5:6) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

egpt_temp <- egpt_df[c("eyetr_value_1", "eyetr_value_2")] %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))

# Plot the first data matrix gd_temp
plot(egpt_temp, pch = 16, col = "blue",
     main = "Not Log-Transformed")

egpt_data = list(x=egpt_temp, N=nrow(egpt_temp))

fit_egpt = stan(
  file="stan_models/bivariate_correlation.stan", 
  data=egpt_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  verbose = FALSE
  )

# Save the model 
fit_egpt@stanmodel@dso <- new("cxxdso")
saveRDS(fit_egpt, file = paste0("./bayesian_models/bayesian_models_correlation/eyetr_eyetr_go_past_time_cor.rds"))
print("Total Duration")
[1] "Total Duration"
etd_df = provo_eyetr_grouped_df %>% filter(metric == "total_duration") %>% #group_by(text_id, metric, measure) %>%
  # summarize(value = mean(value, na.rm = TRUE), .groups = 'drop') %>%
    spread(measure, value) %>%
  # smoothing, if includes 0s
  mutate(eyetr_value_1 =  pmax(value_1, 1),
         eyetr_value_2 = pmax(value_2, 1)
  ) 
print(cor.test(etd_df$eyetr_value_1, etd_df$eyetr_value_2)$estimate)
   cor 
0.9272 
# View(egd_df)

etd_df %>% 
  gather(measure, value, 5:6) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

etd_temp <- etd_df[c("eyetr_value_1", "eyetr_value_2")] %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))

# Plot the first data matrix gd_temp
plot(etd_temp, pch = 16, col = "blue",
     main = "Total Duration Not Log-Transformed")

etd_data = list(x=etd_temp, N=nrow(etd_temp))

fit_etd = stan(
  file="stan_models/bivariate_correlation.stan", 
  data=etd_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  # verbose = FALSE
  )

# Save the model 
fit_etd@stanmodel@dso <- new("cxxdso")
saveRDS(fit_etd, file = paste0("./bayesian_models/bayesian_models_correlation/eyetr_eyetr_total_duration_cor.rds"))
print("Fisrt Pass Regression Prob.")
[1] "Fisrt Pass Regression Prob."
ereg_df = provo_eyetr_grouped_df %>% filter(metric == "FPReg") %>% distinct() %>% #group_by(text_id, metric, measure) %>%
  # summarize(value = mean(value)) %>%
  filter(!(row_number() %in% c(443, 444, 445, 446))) %>%
    spread(measure, value) %>%
  # smoothing, if includes 0s
  mutate(eyetr_value_1 =  pmax(value_1, 1e-5),
         eyetr_value_2 = pmax(value_2, 1e-5)
  )
print(cor.test(ereg_df$eyetr_value_1, ereg_df$eyetr_value_2)$estimate)
  cor 
0.741 
# View(egd_df)

ereg_df %>% 
  gather(measure, value, 5:6) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

ereg_temp <- ereg_df[c("eyetr_value_1", "eyetr_value_2")] %>%
  drop_na() %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))

# Plot the first data matrix gd_temp
plot(ereg_temp, pch = 16, col = "blue",
     main = "FPReg Not Log-Transformed")

# -------fit model FPReg ----------

# View(ereg_temp)
ereg_data = list(x=ereg_temp, N=nrow(ereg_temp))
fit_ereg = stan(
  file="stan_models/bivariate_normal_reg.stan", 
  data=ereg_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99),
  verbose = FALSE
  )

# Save the model 
fit_ereg@stanmodel@dso <- new("cxxdso")
saveRDS(fit_ereg, file = paste0("./bayesian_models/bayesian_models_correlation/eyetr_eyetr_FPReg_cor5.rds"))
fit_egd = readRDS("./bayesian_models/bayesian_models_correlation/eyetr_eyetr_gaze_duration_cor.rds")
fit_egpt = readRDS("./bayesian_models/bayesian_models_correlation/eyetr_eyetr_go_past_time_cor.rds")
fit_etd = readRDS("./bayesian_models/bayesian_models_correlation/eyetr_eyetr_total_duration_cor.rds")
fit_ereg = readRDS("./bayesian_models/bayesian_models_correlation/ranked_eyetr_eyetr_FPReg_cor.rds")
print('---------------------------- Gaze Duration--------------------------------------------')
[1] "---------------------------- Gaze Duration--------------------------------------------"
print(fit_egd)
Inference for Stan model: bivariate_correlation.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

                  mean se_mean     sd      2.5%       25%       50%       75%     97.5% n_eff Rhat
mu[1]           132.90    0.02   1.60    129.78    131.84    132.89    133.94    136.08  4438    1
mu[2]           122.64    0.02   1.41    119.84    121.71    122.66    123.58    125.43  4474    1
sigma[1]         73.79    0.02   1.17     71.50     73.01     73.78     74.56     76.15  3691    1
sigma[2]         65.41    0.02   1.04     63.37     64.74     65.40     66.09     67.51  3824    1
nu               16.48    0.04   2.69     12.28     14.56     16.17     18.01     22.73  4385    1
rho               0.92    0.00   0.00      0.92      0.92      0.92      0.93      0.93  5001    1
cov[1,1]       5446.60    2.85 173.42   5112.56   5330.91   5443.47   5558.73   5798.41  3708    1
cov[1,2]       4456.81    2.40 145.17   4176.71   4361.17   4454.59   4548.67   4748.93  3670    1
cov[2,1]       4456.81    2.40 145.17   4176.71   4361.17   4454.59   4548.67   4748.93  3670    1
cov[2,2]       4279.32    2.20 136.28   4015.83   4191.32   4277.37   4367.26   4557.54  3830    1
x_rand[1]       142.07    0.81  70.30     21.70     91.51    136.29    187.12    297.49  7616    1
x_rand[2]       130.86    0.71  61.94     22.87     86.11    127.74    170.53    262.81  7694    1
attempt           0.06    0.00   0.26      0.00      0.00      0.00      0.00      1.00  8003    1
max_attempts     10.00     NaN   0.00     10.00     10.00     10.00     10.00     10.00   NaN  NaN
lp__         -24130.71    0.03   1.78 -24135.13 -24131.62 -24130.37 -24129.41 -24128.31  3161    1

Samples were drawn using NUTS(diag_e) at Fri Aug 11 12:30:12 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
print('---------------------------- Go Past Time--------------------------------------------')
[1] "---------------------------- Go Past Time--------------------------------------------"
print(fit_egpt)
Inference for Stan model: bivariate_correlation.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

                  mean se_mean     sd      2.5%       25%       50%       75%     97.5% n_eff Rhat
mu[1]           158.88    0.03   2.18    154.63    157.39    158.87    160.38    163.18  4160    1
mu[2]           145.58    0.03   1.91    141.86    144.27    145.57    146.89    149.34  4249    1
sigma[1]         93.78    0.03   1.66     90.59     92.64     93.80     94.90     97.15  3659    1
sigma[2]         82.27    0.03   1.45     79.48     81.26     82.27     83.25     85.15  3180    1
nu                8.83    0.02   0.95      7.21      8.16      8.75      9.42     10.97  3630    1
rho               0.92    0.00   0.00      0.92      0.92      0.92      0.93      0.93  5399    1
cov[1,1]       8797.65    5.14 311.03   8206.42   8582.36   8797.95   9005.48   9437.99  3654    1
cov[1,2]       7124.68    4.37 256.74   6640.23   6943.57   7123.16   7299.14   7643.95  3458    1
cov[2,1]       7124.68    4.37 256.74   6640.23   6943.57   7123.16   7299.14   7643.95  3458    1
cov[2,2]       6770.74    4.24 239.06   6316.85   6603.30   6768.37   6931.34   7249.73  3172    1
x_rand[1]       178.07    1.04  92.17     24.26    112.53    169.84    234.18    379.79  7877    1
x_rand[2]       162.02    0.91  81.68     25.72    103.73    154.98    211.07    340.38  8056    1
attempt           0.08    0.00   0.30      0.00      0.00      0.00      0.00      1.00  8200    1
max_attempts     10.00     NaN   0.00     10.00     10.00     10.00     10.00     10.00   NaN  NaN
lp__         -25277.02    0.03   1.72 -25281.22 -25277.95 -25276.68 -25275.76 -25274.67  3158    1

Samples were drawn using NUTS(diag_e) at Fri Aug 11 12:36:02 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
print('---------------------------- Total Duration--------------------------------------------')
[1] "---------------------------- Total Duration--------------------------------------------"
print(fit_etd)
Inference for Stan model: bivariate_correlation.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

                  mean se_mean     sd      2.5%       25%       50%       75%     97.5% n_eff Rhat
mu[1]           185.81    0.03   2.07    181.78    184.42    185.77    187.21    189.91  4202    1
mu[2]           166.28    0.03   1.83    162.73    165.04    166.25    167.50    169.93  4211    1
sigma[1]         94.20    0.03   1.57     91.11     93.15     94.20     95.27     97.30  3447    1
sigma[2]         83.97    0.02   1.39     81.27     83.02     83.96     84.90     86.69  3383    1
nu               12.25    0.02   1.64      9.57     11.09     12.09     13.20     15.94  4615    1
rho               0.94    0.00   0.00      0.93      0.93      0.94      0.94      0.94  5000    1
cov[1,1]       8876.18    5.03 295.39   8301.65   8677.64   8873.28   9075.84   9466.62  3443    1
cov[1,2]       7402.10    4.37 250.04   6915.30   7234.88   7398.57   7570.50   7901.07  3274    1
cov[2,1]       7402.10    4.37 250.04   6915.30   7234.88   7398.57   7570.50   7901.07  3274    1
cov[2,2]       7052.58    4.00 232.76   6605.40   6892.44   7049.39   7208.70   7514.46  3380    1
x_rand[1]       195.43    1.04  92.36     34.49    130.88    189.65    253.14    395.07  7886    1
x_rand[2]       175.22    0.94  82.69     30.41    117.02    170.67    226.08    355.46  7783    1
attempt           0.05    0.00   0.22      0.00      0.00      0.00      0.00      1.00  7894    1
max_attempts     10.00     NaN   0.00     10.00     10.00     10.00     10.00     10.00   NaN  NaN
lp__         -25256.96    0.03   1.72 -25261.06 -25257.90 -25256.64 -25255.69 -25254.56  2928    1

Samples were drawn using NUTS(diag_e) at Fri Aug 11 13:07:03 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
print('---------------------------- First Pass Regression Prob.--------------------------------------------')
[1] "---------------------------- First Pass Regression Prob.--------------------------------------------"
print(fit_ereg)
Inference for Stan model: bivariate_correlation.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

                  mean se_mean       sd      2.5%       25%       50%       75%     97.5% n_eff Rhat
mu[1]          1255.64    0.12    10.80   1234.54   1248.36   1255.74   1262.93   1276.79  8627    1
mu[2]          1252.93    0.12    10.87   1231.37   1245.54   1253.06   1260.17   1274.30  8702    1
sigma[1]        765.79    0.09     8.39    749.70    760.10    765.62    771.49    782.53  9017    1
sigma[2]        770.39    0.09     8.39    754.19    764.66    770.34    775.92    787.09  8416    1
nu              158.18    0.28    29.26    108.58    137.85    155.21    176.28    223.29 10599    1
rho               0.63    0.00     0.01      0.61      0.63      0.63      0.64      0.66  9459    1
cov[1,1]     586505.54  135.48 12860.89 562043.91 577744.85 586178.32 595202.01 612347.58  9011    1
cov[1,2]     373981.79  127.91 11325.06 351915.15 366391.42 373981.76 381542.13 396695.84  7839    1
cov[2,1]     373981.79  127.91 11325.06 351915.15 366391.42 373981.76 381542.13 396695.84  7839    1
cov[2,2]     593566.17  140.99 12933.82 568805.46 584710.34 593422.01 602051.63 619514.48  8415    1
x_rand[1]      1373.55    7.61   682.32    169.56    876.21   1347.83   1820.73   2770.16  8038    1
x_rand[2]      1385.00    7.74   690.70    179.87    876.47   1358.67   1839.94   2853.36  7965    1
attempt           0.09    0.00     0.32      0.00      0.00      0.00      0.00      1.00  7869    1
max_attempts     10.00     NaN     0.00     10.00     10.00     10.00     10.00     10.00   NaN  NaN
lp__         -80031.49    0.03     1.74 -80035.79 -80032.39 -80031.19 -80030.22 -80029.08  3690    1

Samples were drawn using NUTS(diag_e) at Wed Aug  9 22:01:28 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
# stan_trace(fit_egd, pars=c("rho", "mu", "sigma", "nu"))
# stan_dens(fit_egd, pars=c("rho", "mu", "sigma", "nu"), separate_chains = TRUE)
# stan_plot(fit_egd, pars=c("rho", "mu", "sigma", "nu"))

# Gaze Duration
stan_trace(fit_egd)
stan_dens(fit_egd, separate_chains = TRUE)
stan_plot(fit_egd)

# Go Past Time
stan_trace(fit_egpt)
stan_dens(fit_egpt, separate_chains = TRUE)
stan_plot(fit_egpt)

# Total Duration
stan_trace(fit_etd)
stan_dens(fit_etd, separate_chains = TRUE)
stan_plot(fit_etd)

# FPReg
stan_trace(fit_ereg)
stan_dens(fit_ereg, separate_chains = TRUE)
stan_plot(fit_ereg)
print('---------------------------- Gaze Duration--------------------------------------------')
[1] "---------------------------- Gaze Duration--------------------------------------------"
rho_egd = as.numeric(extract(fit_egd, "rho")[[1]])
mean = mean(rho_egd)
crI = quantile(rho_egd, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_egd), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
Mean: 0.9231
HPD: [0.917, 0.9292]
crI: [0.9167, 0.9291]
print('---------------------------- Go Past Time--------------------------------------------')
[1] "---------------------------- Go Past Time--------------------------------------------"
rho_egpt = as.numeric(extract(fit_egpt, "rho")[[1]])
mean = mean(rho_egpt)
crI = quantile(rho_egpt, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_egpt), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
Mean: 0.9231
HPD: [0.9169, 0.9294]
crI: [0.9166, 0.9292]
print('---------------------------- Total Duration--------------------------------------------')
[1] "---------------------------- Total Duration--------------------------------------------"
rho_etd = as.numeric(extract(fit_etd, "rho")[[1]])
mean = mean(rho_etd)
crI = quantile(rho_etd, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_etd), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
Mean: 0.9355
HPD: [0.9301, 0.9406]
crI: [0.9301, 0.9406]
print('---------------------------- First Pass Regression --------------------------------------------')
[1] "---------------------------- First Pass Regression --------------------------------------------"
rho_ereg = as.numeric(extract(fit_ereg, "rho")[[1]])
mean = mean(rho_ereg)
crI = quantile(rho_ereg, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_ereg), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]")
Mean: 0.6338
HPD: [0.6095, 0.6552]
crI: [0.61, 0.6558]
print('---------------------------- Gaze Duration--------------------------------------------')
egd_rand <- extract(fit_egd, "x_rand")[[1]]
# x_rand_filtered <- x_rand[apply(x_rand, 1, function(x) all(x > 0)),]
# x_rand_filtered

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 400), ylim=c(0, 700), type="n",
     xlab = "Eye tracking value 1", ylab = "Eye tracking value 2", main = "Gaze Duration") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(egd_rand[,1], egd_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(egd_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(egd_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(egd_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- Go Past Time--------------------------------------------')
egpt_rand <- extract(fit_egpt, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 800), ylim=c(0, 1200), type="n",
     xlab = "Eye tracking value 1", ylab = "Eye tracking value 2", main = "Go Past Time") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(egpt_rand[,1], egpt_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(egpt_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(egpt_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(egpt_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- Total Duration--------------------------------------------')
etd_rand <- extract(fit_etd, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 800), ylim=c(0, 1200), type="n",
     xlab = "Eye tracking value 1", ylab = "Eye tracking value 2", main = "Total Duration") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(etd_rand[,1], etd_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(etd_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(etd_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(etd_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- First Pass Regression --------------------------------------------')
ereg_rand <- extract(fit_ereg, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value 1", ylab = "Eye tracking value 2", main = "First Pass Regression") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(ereg_rand[,1], ereg_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(ereg_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(ereg_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(ereg_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

Bayesian – correlation between MoTR and word level statistics

print("Log Frequency")
[1] "Log Frequency"
stats_cor_df = provo_df %>% filter(metric == "gaze_duration") %>% spread(measure, value)
print(cor.test(stats_cor_df$motr_value, stats_cor_df$freq)$estimate)
    cor 
-0.7454 
print(cor.test(stats_cor_df$eyetr_value, stats_cor_df$freq)$estimate)
    cor 
-0.8069 
# View(stats_cor_df)
stats_cor_df %>% 
  gather(measure, value, c(7, 13)) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

mfreq_temp <- stats_cor_df[c("motr_value", "freq")] %>%
  data.matrix()
efreq_temp <- stats_cor_df[c("eyetr_value", "freq")] %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))

# Plot the first data matrix gd_temp
plot(mfreq_temp, pch = 16, col = "blue",
     main = "MoTR RTs and Word Frequency")

# Plot the first data matrix gd_temp
plot(efreq_temp, pch = 16, col = "blue",
     main = "EyeTR RTs and Word Frequency")

motr & frequency

mfreq_data = list(x=mfreq_temp, N=nrow(mfreq_temp))

fit_mfreq = stan(
  file="stan_models/stats_correlation.stan", 
  data=mfreq_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  # verbose = FALSE
  )

# Save the model 
fit_mfreq@stanmodel@dso <- new("cxxdso")
saveRDS(fit_mfreq, file = paste0("./bayesian_models/bayesian_models_correlation/motr_freq_cor.rds"))

eyetr & frequency

efreq_data = list(x=efreq_temp, N=nrow(efreq_temp))

fit_efreq = stan(
  file="stan_models/stats_correlation.stan", 
  data=efreq_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  # verbose = FALSE
  )

# Save the model 
fit_efreq@stanmodel@dso <- new("cxxdso")
saveRDS(fit_efreq, file = paste0("./bayesian_models/bayesian_models_correlation/eyetr_freq_cor.rds"))
print("Length")
[1] "Length"
stats_cor_df = provo_df %>% filter(metric == "gaze_duration") %>% spread(measure, value)
print(cor.test(stats_cor_df$motr_value, stats_cor_df$len)$estimate)
   cor 
0.8644 
print(cor.test(stats_cor_df$eyetr_value, stats_cor_df$len)$estimate)
   cor 
0.8597 
# View(stats_cor_df)
stats_cor_df %>% 
  gather(measure, value, c(9, 13)) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

mlen_temp <- stats_cor_df[c("motr_value", "len")] %>%
  data.matrix()
elen_temp <- stats_cor_df[c("eyetr_value", "len")] %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))

# Plot the first data matrix gd_temp
plot(mlen_temp, pch = 16, col = "blue",
     main = "MoTR RTs and Word Length")

# Plot the first data matrix gd_temp
plot(elen_temp, pch = 16, col = "blue",
     main = "EyeTR RTs and Word Length")

motr & length

mlen_data = list(x=mlen_temp, N=nrow(mlen_temp))

fit_mlen = stan(
  file="stan_models/stats_correlation_len_normal.stan", 
  data=mlen_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  # verbose = FALSE
  )

# Save the model 
fit_mlen@stanmodel@dso <- new("cxxdso")
saveRDS(fit_mlen, file = paste0("./bayesian_models/bayesian_models_correlation/motr_len_cor.rds"))

eyetr & length

elen_data = list(x=elen_temp, N=nrow(elen_temp))

fit_elen = stan(
  file="stan_models/stats_correlation_len_normal.stan", 
  data=elen_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  # verbose = FALSE
  )

# Save the model 
fit_elen@stanmodel@dso <- new("cxxdso")
saveRDS(fit_elen, file = paste0("./bayesian_models/bayesian_models_correlation/eyetr_len_cor.rds"))
print("Surprisal")
[1] "Surprisal"
stats_cor_df = provo_df %>% filter(metric == "gaze_duration") %>% spread(measure, value)
print(cor.test(stats_cor_df$motr_value, stats_cor_df$surp)$estimate)
   cor 
0.4978 
print(cor.test(stats_cor_df$eyetr_value, stats_cor_df$surp)$estimate)
   cor 
0.5683 
# View(stats_cor_df)
stats_cor_df %>% 
  gather(measure, value, c(8, 13)) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

msurp_temp <- stats_cor_df[c("motr_value", "surp")] %>%
  data.matrix()
esurp_temp <- stats_cor_df[c("eyetr_value", "surp")] %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))

# Plot the first data matrix gd_temp
plot(msurp_temp, pch = 16, col = "blue",
     main = "MoTR RTs and Surprisal")

# Plot the first data matrix gd_temp
plot(esurp_temp, pch = 16, col = "blue",
     main = "EyeTR RTs and Surprisal")

motr & surprisal

msurp_data = list(x=msurp_temp, N=nrow(msurp_temp))

fit_msurp = stan(
  file="stan_models/stats_correlation.stan", 
  data=msurp_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  # verbose = FALSE
  )

# Save the model 
fit_msurp@stanmodel@dso <- new("cxxdso")
saveRDS(fit_msurp, file = paste0("./bayesian_models/bayesian_models_correlation/motr_surp_cor.rds"))

eyetr & surprisal

esurp_data = list(x=esurp_temp, N=nrow(esurp_temp))

fit_esurp = stan(
  file="stan_models/stats_correlation.stan", 
  data=esurp_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  # verbose = FALSE
  )

# Save the model 
fit_esurp@stanmodel@dso <- new("cxxdso")
saveRDS(fit_esurp, file = paste0("./bayesian_models/bayesian_models_correlation/eyetr_surp_cor.rds"))
fit_mfreq = readRDS("./bayesian_models/bayesian_models_correlation/motr_freq_cor.rds")
fit_efreq = readRDS("./bayesian_models/bayesian_models_correlation/eyetr_freq_cor.rds")
fit_mlen = readRDS("./bayesian_models/bayesian_models_correlation/motr_len_cor.rds")
fit_elen = readRDS("./bayesian_models/bayesian_models_correlation/eyetr_len_cor.rds")
fit_msurp = readRDS("./bayesian_models/bayesian_models_correlation/motr_surp_cor.rds")
fit_esurp = readRDS("./bayesian_models/bayesian_models_correlation/eyetr_surp_cor.rds")

print('---------------------------- MoTR & Log Frequency --------------------------------------------')
[1] "---------------------------- MoTR & Log Frequency --------------------------------------------"
print(fit_mfreq)
Inference for Stan model: stats_correlation.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

                  mean se_mean     sd      2.5%       25%       50%       75%     97.5% n_eff Rhat
mu[1]           206.16    0.02   1.93    202.44    204.86    206.17    207.47    209.88  8522    1
mu[2]             5.80    0.00   0.02      5.77      5.79      5.80      5.81      5.84  8983    1
sigma[1]        138.91    0.02   1.50    136.02    137.87    138.92    139.91    141.86  8278    1
sigma[2]          1.34    0.00   0.01      1.32      1.33      1.34      1.35      1.37  8274    1
nu               99.82    0.24  23.02     62.37     83.46     97.35    113.85    150.17  9319    1
rho              -0.76    0.00   0.01     -0.78     -0.77     -0.76     -0.76     -0.75  8377    1
cov[1,1]      19299.02    4.59 417.70  18501.76  19006.99  19298.28  19575.75  20123.93  8268    1
cov[1,2]       -141.93    0.04   3.34   -148.59   -144.18   -141.93   -139.65   -135.44  6864    1
cov[2,1]       -141.93    0.04   3.34   -148.59   -144.18   -141.93   -139.65   -135.44  6864    1
cov[2,2]          1.80    0.00   0.04      1.73      1.78      1.80      1.83      1.87  8265    1
x_rand[1]       227.30    1.38 121.53     20.78    136.69    221.29    306.64    484.97  7799    1
x_rand[2]         5.66    0.01   1.26      3.09      4.82      5.70      6.53      8.03  8041    1
attempt           0.07    0.00   0.27      0.00      0.00      0.00      0.00      1.00  7926    1
max_attempts     10.00     NaN   0.00     10.00     10.00     10.00     10.00     10.00   NaN  NaN
lp__         -36159.81    0.03   1.73 -36164.02 -36160.74 -36159.48 -36158.53 -36157.45  3770    1

Samples were drawn using NUTS(diag_e) at Sun Jul 23 15:18:27 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
print('---------------------------- EyeTR & Log Frequency --------------------------------------------')
[1] "---------------------------- EyeTR & Log Frequency --------------------------------------------"
print(fit_efreq)
Inference for Stan model: stats_correlation.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

                  mean se_mean     sd      2.5%       25%       50%       75%     97.5% n_eff Rhat
mu[1]           129.58    0.01   0.95    127.71    128.96    129.59    130.21    131.45  7967    1
mu[2]             5.79    0.00   0.02      5.75      5.78      5.79      5.80      5.82  8371    1
sigma[1]         73.11    0.01   0.75     71.66     72.60     73.10     73.61     74.61  7440    1
sigma[2]          1.34    0.00   0.01      1.31      1.33      1.34      1.35      1.36  7478    1
nu               91.76    0.24  21.94     56.83     76.18     89.08    104.50    142.84  8551    1
rho              -0.82    0.00   0.01     -0.83     -0.82     -0.82     -0.81     -0.81  8039    1
cov[1,1]       5345.65    1.28 110.19   5134.97   5270.29   5343.14   5418.78   5566.32  7432    1
cov[1,2]        -80.04    0.02   1.70    -83.43    -81.17    -80.03    -78.89    -76.73  6431    1
cov[2,1]        -80.04    0.02   1.70    -83.43    -81.17    -80.03    -78.89    -76.73  6431    1
cov[2,2]          1.79    0.00   0.04      1.72      1.77      1.79      1.81      1.86  7476    1
x_rand[1]       137.10    0.77  67.73     16.30     87.55    134.62    182.36    275.36  7640    1
x_rand[2]         5.68    0.01   1.27      3.13      4.82      5.68      6.55      8.11  7779    1
attempt           0.04    0.00   0.19      0.00      0.00      0.00      0.00      1.00  8076    1
max_attempts     10.00     NaN   0.00     10.00     10.00     10.00     10.00     10.00   NaN  NaN
lp__         -32784.74    0.03   1.72 -32789.05 -32785.64 -32784.43 -32783.48 -32782.37  4038    1

Samples were drawn using NUTS(diag_e) at Sun Jul 23 15:44:55 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
print('---------------------------- MoTR & Length --------------------------------------------')
[1] "---------------------------- MoTR & Length --------------------------------------------"
print(fit_mlen)
Inference for Stan model: stats_correlation_len_normal.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

                  mean se_mean     sd      2.5%       25%       50%       75%     97.5% n_eff Rhat
mu[1]           197.87    0.03   2.04    193.81    196.48    197.88    199.27    201.86  4675    1
mu[2]             4.40    0.00   0.04      4.33      4.38      4.40      4.43      4.48  4776    1
sigma[1]        136.99    0.03   1.75    133.57    135.79    136.99    138.19    140.43  4848    1
sigma[2]          2.51    0.00   0.03      2.45      2.49      2.51      2.53      2.58  4665    1
nu               17.03    0.04   2.84     12.51     15.05     16.64     18.61     23.71  4796    1
rho               0.89    0.00   0.00      0.88      0.88      0.89      0.89      0.89  5940    1
cov[1,1]      18768.69    6.88 479.04  17841.00  18438.93  18766.08  19096.47  19721.77  4845    1
cov[1,2]        305.06    0.12   7.91    289.81    299.65    305.00    310.39    320.87  4341    1
cov[2,1]        305.06    0.12   7.91    289.81    299.65    305.00    310.39    320.87  4341    1
cov[2,2]          6.31    0.00   0.17      5.98      6.19      6.30      6.42      6.64  4664    1
x_rand[1]       219.00    1.39 122.76     20.11    127.61    208.39    297.11    488.02  7751    1
x_rand[2]         5.26    0.03   2.36      1.00      4.00      5.00      7.00     10.00  7861    1
attempt           0.10    0.00   0.33      0.00      0.00      0.00      0.00      1.00  7797    1
max_attempts     10.00     NaN   0.00     10.00     10.00     10.00     10.00     10.00   NaN  NaN
lp__         -38437.19    0.03   1.75 -38441.43 -38438.10 -38436.86 -38435.92 -38434.79  3210    1

Samples were drawn using NUTS(diag_e) at Sun Jul 23 19:35:18 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
print('---------------------------- EyeTR & Length --------------------------------------------')
[1] "---------------------------- EyeTR & Length --------------------------------------------"
print(fit_elen)
Inference for Stan model: stats_correlation_len_normal.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

                  mean se_mean     sd      2.5%       25%       50%       75%     97.5% n_eff Rhat
mu[1]           125.47    0.01   1.05    123.48    124.75    125.46    126.18    127.53  5037    1
mu[2]             4.45    0.00   0.04      4.38      4.43      4.45      4.48      4.52  4809    1
sigma[1]         72.58    0.01   0.88     70.87     71.98     72.57     73.15     74.31  4816    1
sigma[2]          2.49    0.00   0.03      2.42      2.47      2.49      2.51      2.55  4786    1
nu               17.29    0.04   2.69     13.08     15.35     16.97     18.80     23.46  4894    1
rho               0.88    0.00   0.00      0.87      0.88      0.88      0.89      0.89  5800    1
cov[1,1]       5268.20    1.84 127.51   5022.34   5181.60   5266.78   5351.59   5522.02  4819    1
cov[1,2]        159.48    0.06   3.98    151.79    156.77    159.44    162.13    167.48  4232    1
cov[2,1]        159.48    0.06   3.98    151.79    156.77    159.44    162.13    167.48  4232    1
cov[2,2]          6.19    0.00   0.16      5.88      6.08      6.19      6.30      6.51  4790    1
x_rand[1]       135.19    0.75  67.87     18.34     85.18    132.06    179.13    278.49  8296    1
x_rand[2]         5.29    0.03   2.41      1.00      4.00      5.00      7.00     10.00  8133    1
attempt           0.06    0.00   0.24      0.00      0.00      0.00      0.00      1.00  7560    1
max_attempts     10.00     NaN   0.00     10.00     10.00     10.00     10.00     10.00   NaN  NaN
lp__         -35403.80    0.03   1.76 -35408.14 -35404.73 -35403.49 -35402.51 -35401.38  3435    1

Samples were drawn using NUTS(diag_e) at Sun Jul 23 19:55:18 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
print('---------------------------- MoTR & Surprisal --------------------------------------------')
[1] "---------------------------- MoTR & Surprisal --------------------------------------------"
print(fit_msurp)
Inference for Stan model: stats_correlation.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

                  mean se_mean     sd      2.5%       25%       50%       75%     97.5% n_eff Rhat
mu[1]           193.29    0.02   2.22    189.00    191.80    193.27    194.83    197.56  7883    1
mu[2]             6.02    0.00   0.08      5.86      5.97      6.02      6.08      6.19  8202    1
sigma[1]        135.56    0.02   1.87    132.03    134.29    135.53    136.83    139.29  6733    1
sigma[2]          5.10    0.00   0.08      4.95      5.05      5.10      5.15      5.25  6738    1
nu               11.72    0.02   1.35      9.45     10.76     11.58     12.56     14.76  6506    1
rho               0.57    0.00   0.01      0.54      0.56      0.57      0.58      0.60  7907    1
cov[1,1]      18380.00    6.19 508.20  17430.66  18032.66  18367.66  18723.49  19401.81  6740    1
cov[1,2]        396.36    0.20  15.81    365.40    385.79    396.30    407.09    427.00  6245    1
cov[2,1]        396.36    0.20  15.81    365.40    385.79    396.30    407.09    427.00  6245    1
cov[2,2]         25.98    0.01   0.78     24.50     25.45     25.98     26.51     27.56  6750    1
x_rand[1]       229.82    1.41 125.86     23.36    137.77    220.32    308.08    504.67  7929    1
x_rand[2]         7.63    0.05   4.51      0.66      4.30      7.12     10.37     17.97  7826    1
attempt           0.21    0.01   0.51      0.00      0.00      0.00      0.00      2.00  7854    1
max_attempts     10.00     NaN   0.00     10.00     10.00     10.00     10.00     10.00   NaN  NaN
lp__         -43211.31    0.03   1.76 -43215.70 -43212.20 -43210.97 -43210.02 -43208.89  3913    1

Samples were drawn using NUTS(diag_e) at Sun Jul 23 20:47:52 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
print('---------------------------- EyeTR & Surprisal --------------------------------------------')
[1] "---------------------------- EyeTR & Surprisal --------------------------------------------"
print(fit_esurp)
Inference for Stan model: stats_correlation.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

                  mean se_mean     sd      2.5%       25%       50%       75%     97.5% n_eff Rhat
mu[1]           123.52    0.01   1.17    121.28    122.73    123.52    124.30    125.81  7205    1
mu[2]             6.14    0.00   0.08      5.97      6.08      6.13      6.19      6.30  7266    1
sigma[1]         72.79    0.01   0.95     70.95     72.15     72.77     73.43     74.64  6894    1
sigma[2]          5.16    0.00   0.07      5.01      5.11      5.16      5.21      5.30  5938    1
nu               14.77    0.03   1.97     11.49     13.38     14.56     15.97     19.18  5988    1
rho               0.63    0.00   0.01      0.61      0.62      0.63      0.64      0.66  8585    1
cov[1,1]       5299.28    1.66 138.13   5033.28   5205.63   5295.99   5392.41   5571.18  6894    1
cov[1,2]        237.27    0.11   8.26    221.19    231.62    237.24    242.84    253.80  6152    1
cov[2,1]        237.27    0.11   8.26    221.19    231.62    237.24    242.84    253.80  6152    1
cov[2,2]         26.59    0.01   0.77     25.10     26.07     26.58     27.11     28.14  5952    1
x_rand[1]       141.12    0.80  69.27     22.01     91.25    136.11    184.96    290.51  7508    1
x_rand[2]         7.68    0.05   4.55      0.67      4.31      7.15     10.45     17.96  7265    1
attempt           0.17    0.00   0.45      0.00      0.00      0.00      0.00      1.00  8132    1
max_attempts     10.00     NaN   0.00     10.00     10.00     10.00     10.00     10.00   NaN  NaN
lp__         -40032.38    0.03   1.76 -40036.66 -40033.33 -40032.04 -40031.08 -40029.99  3594    1

Samples were drawn using NUTS(diag_e) at Sun Jul 23 21:06:56 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
# MoTR & Log Freq
stan_trace(fit_mfreq)
stan_dens(fit_mfreq, separate_chains = TRUE)
stan_plot(fit_mfreq)

# EyeTR & Log Freq
stan_trace(fit_efreq)
stan_dens(fit_efreq, separate_chains = TRUE)
stan_plot(fit_efreq)

# MoTR & Len
stan_trace(fit_mlen)
stan_dens(fit_mlen, separate_chains = TRUE)
stan_plot(fit_mlen)

# EyeTR & Len
stan_trace(fit_elen)
stan_dens(fit_elen, separate_chains = TRUE)
stan_plot(fit_elen)

# MoTR & Surprisal
stan_trace(fit_msurp)
stan_dens(fit_msurp, separate_chains = TRUE)
stan_plot(fit_msurp)

# EyeTR & Surprisal
stan_trace(fit_esurp)
stan_dens(fit_esurp, separate_chains = TRUE)
stan_plot(fit_esurp)
print('---------------------------- MoTR & Log Freq --------------------------------------------')
[1] "---------------------------- MoTR & Log Freq --------------------------------------------"
rho_mfreq = as.numeric(extract(fit_mfreq, "rho")[[1]])
mean = mean(rho_mfreq)
crI = quantile(rho_mfreq, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_mfreq), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
Mean: -0.7613
HPD: [-0.776, -0.7461]
crI: [-0.7762, -0.7462]
print('---------------------------- EyeTR & Log Freq --------------------------------------------')
[1] "---------------------------- EyeTR & Log Freq --------------------------------------------"
rho_efreq = as.numeric(extract(fit_efreq, "rho")[[1]])
mean = mean(rho_efreq)
crI = quantile(rho_efreq, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_efreq), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
Mean: -0.8182
HPD: [-0.8297, -0.8065]
crI: [-0.8295, -0.8062]
print('---------------------------- MoTR & Length --------------------------------------------')
[1] "---------------------------- MoTR & Length --------------------------------------------"
rho_mlen = as.numeric(extract(fit_mlen, "rho")[[1]])
mean = mean(rho_mlen)
crI = quantile(rho_mlen, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_mlen), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
Mean: 0.8867
HPD: [0.8784, 0.8943]
crI: [0.8784, 0.8943]
print('---------------------------- EyeTR & Length --------------------------------------------')
[1] "---------------------------- EyeTR & Length --------------------------------------------"
rho_elen = as.numeric(extract(fit_elen, "rho")[[1]])
mean = mean(rho_elen)
crI = quantile(rho_elen, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_elen), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
Mean: 0.883
HPD: [0.8743, 0.8908]
crI: [0.8746, 0.8911]
print('---------------------------- MoTR & Surprisal --------------------------------------------')
[1] "---------------------------- MoTR & Surprisal --------------------------------------------"
rho_msurp = as.numeric(extract(fit_msurp, "rho")[[1]])
mean = mean(rho_msurp)
crI = quantile(rho_msurp, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_msurp), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
Mean: 0.5735
HPD: [0.5447, 0.6033]
crI: [0.5432, 0.602]
print('---------------------------- EyeTR & Surprisal --------------------------------------------')
[1] "---------------------------- EyeTR & Surprisal --------------------------------------------"
rho_esurp = as.numeric(extract(fit_esurp, "rho")[[1]])
mean = mean(rho_esurp)
crI = quantile(rho_esurp, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_esurp), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]")
Mean: 0.632
HPD: [0.6075, 0.6574]
crI: [0.6066, 0.6569]
print('---------------------------- MoTR & Log Frequency--------------------------------------------')
mfreq_rand <- extract(fit_mfreq, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 800), ylim=c(0, 12), type="n",
     xlab = "MoTR value", ylab = "Log Frequency", main = "Gaze Duration") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(mfreq_rand[,1], mfreq_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(mfreq_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(mfreq_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(mfreq_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- EyeTR & Log Frequency--------------------------------------------')
efreq_rand <- extract(fit_efreq, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 500), ylim=c(0, 12), type="n",
     xlab = "Eye tracking value", ylab = "Log Frequency", main = "Gaze Duration") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(efreq_rand[,1], efreq_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(efreq_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(efreq_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(efreq_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- MoTR & Length --------------------------------------------')
mlen_rand <- extract(fit_mlen, "x_rand")[[1]]
# mlen_rand

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 800), ylim=c(0, 20), type="n",
     xlab = "MoTR value", ylab = "Word Length", main = "Gaze Duration") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(mlen_rand[,1], mlen_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(mlen_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(mlen_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(mlen_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- EyeTR & Length --------------------------------------------')
elen_rand <- extract(fit_elen, "x_rand")[[1]]
# elen_rand

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 800), ylim=c(0, 20), type="n",
     xlab = "EyeTR value", ylab = "Word Length", main = "Gaze Duration") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(elen_rand[,1], elen_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(elen_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(elen_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(elen_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- MoTR & Surprisal --------------------------------------------')
msurp_rand <- extract(fit_msurp, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 800), ylim=c(0, 20), type="n",
     xlab = "MoTR value", ylab = "Word Surprisal", main = "Gaze Duration") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(msurp_rand [,1], msurp_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(msurp_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(msurp_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(msurp_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- EyeTR & Surprisal --------------------------------------------')
esurp_rand <- extract(fit_esurp, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 800), ylim=c(0, 20), type="n",
     xlab = "EyeTR value", ylab = "Word Surprisal", main = "Gaze Duration") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(esurp_rand [,1], esurp_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(esurp_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(esurp_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(esurp_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")
print("EyeTR vs. EyeTR Fisrt Pass Regression Prob. < 0.3 ")
[1] "EyeTR vs. EyeTR Fisrt Pass Regression Prob. < 0.3 "
ereg_df = provo_eyetr_grouped_df %>% filter(metric == "FPReg") %>% distinct() %>% #group_by(text_id, metric, measure) %>%
  # summarize(value = mean(value)) %>%
  filter(!(row_number() %in% c(443, 444, 445, 446))) %>%
    spread(measure, value) %>%
  # smoothing, if includes 0s
  mutate(eyetr_value_1 =  pmax(value_1, 1e-5),
         eyetr_value_2 = pmax(value_2, 1e-5))

ereg_df_low = provo_eyetr_grouped_df %>% filter(metric == "FPReg") %>% distinct() %>% #group_by(text_id, metric, measure) %>%
  # summarize(value = mean(value)) %>%
  filter(!(row_number() %in% c(443, 444, 445, 446))) %>%
    spread(measure, value) %>%
  # smoothing, if includes 0s
  mutate(eyetr_value_1 =  pmax(value_1, 1e-5),
         eyetr_value_2 = pmax(value_2, 1e-5)) %>%
  filter(eyetr_value_1 < 0.3)
# View(ereg_df_low)

ereg_df_high = provo_eyetr_grouped_df %>% filter(metric == "FPReg") %>% distinct() %>% #group_by(text_id, metric, measure) %>%
  # summarize(value = mean(value)) %>%
  filter(!(row_number() %in% c(443, 444, 445, 446))) %>%
    spread(measure, value) %>%
  # smoothing, if includes 0s
  mutate(eyetr_value_1 =  pmax(value_1, 1e-5),
         eyetr_value_2 = pmax(value_2, 1e-5)) %>%
  filter(eyetr_value_1 >= 0.3)
# View(ereg_df_high) 

print(cor.test(ereg_df$eyetr_value_1, ereg_df$eyetr_value_2)$estimate)
  cor 
0.741 
print(cor.test(ereg_df$eyetr_value_1, ereg_df$eyetr_value_2)$p.value)
[1] 0
print(cor.test(ereg_df_low$eyetr_value_1, ereg_df_low$eyetr_value_2)$estimate)
   cor 
0.4586 
print(cor.test(ereg_df_low$eyetr_value_1, ereg_df_low$eyetr_value_2)$p.value)
[1] 0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000293
print(cor.test(ereg_df_high$eyetr_value_1, ereg_df_high$eyetr_value_2)$estimate)
   cor 
0.6653 
print(cor.test(ereg_df_high$eyetr_value_1, ereg_df_high$eyetr_value_2)$p.value)
[1] 0.000000000000000000000000000000000000000002144
# View(egd_df)

ereg_df %>% 
  gather(measure, value, 5:6) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

ereg_temp <- ereg_df[c("eyetr_value_1", "eyetr_value_2")] %>%
  drop_na() %>%
  data.matrix()
ereg_temp_low <- ereg_df_low[c("eyetr_value_1", "eyetr_value_2")] %>%
  drop_na() %>%
  data.matrix()
ereg_temp_high <- ereg_df_high[c("eyetr_value_1", "eyetr_value_2")] %>%
  drop_na() %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 3))

# Plot the first data matrix gd_temp
plot(ereg_temp, pch = 16, col = "blue",
     main = "FPReg all data Not Log-Transformed")
plot(ereg_temp_low, pch = 16, col = "blue",
     main = "FPReg < 0.3 Not Log-Transformed")
plot(ereg_temp_high, pch = 16, col = "blue",
     main = "FPReg > 0.3 Not Log-Transformed")

# -------fit model eyetr vs. eyetr FPReg <0.3 & >=0.3 ----------
reg_data = list(x=ereg_temp, N=nrow(ereg_temp))
fit_reg = stan(
  # file="stan_models/bivariate_beta_correlation_reg.stan", 
  file = "stan_models/bivariate_normal_reg.stan",
  data=reg_data, 
  iter=4000, 
  chains=4, 
  cores=4,
  seed=444,
  # control=list(adapt_delta=0.99), 
  verbose = FALSE
  )

# Save the model 
fit_reg@stanmodel@dso <- new("cxxdso")
saveRDS(fit_reg, file = paste0("./bayesian_models/bayesian_models_correlation/eyetr_eyetr_FPReg_cor_all_data.rds"))

exploratory: divide eye tracking regression data into two parts.

# fit_ereg_all = readRDS("./eyetr_eyetr_FPReg_cor_all_data.rds")
fit_ereg_all = readRDS("./bayesian_models/bayesian_models_correlation/eyetr_eyetr_FPReg_cor.rds")
fit_ereg_low = readRDS("./bayesian_models/bayesian_models_correlation/eyetr_eyetr_FPReg_cor_00-03.rds")
fit_ereg_high = readRDS("./bayesian_models/bayesian_models_correlation/eyetr_eyetr_FPReg_cor_03-1.rds")

print('---------------------------- First Pass Regression Prob. all data --------------------------------------------')
[1] "---------------------------- First Pass Regression Prob. all data --------------------------------------------"
print(fit_ereg_all)
Inference for Stan model: bivariate_normal_reg.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

                 mean se_mean   sd     2.5%      25%      50%      75%    97.5% n_eff Rhat
mu[1]            0.11    0.00 0.00     0.10     0.10     0.11     0.11     0.11  7352    1
mu[2]            0.11    0.00 0.00     0.10     0.11     0.11     0.11     0.11  6993    1
sigma[1]         0.11    0.00 0.00     0.10     0.11     0.11     0.11     0.11  5584    1
sigma[2]         0.10    0.00 0.00     0.10     0.10     0.10     0.10     0.11  5441    1
nu               3.16    0.00 0.13     2.92     3.07     3.16     3.24     3.42  6221    1
rho              0.73    0.00 0.01     0.71     0.73     0.73     0.74     0.76  7209    1
cov[1,1]         0.01    0.00 0.00     0.01     0.01     0.01     0.01     0.01  5581    1
cov[1,2]         0.01    0.00 0.00     0.01     0.01     0.01     0.01     0.01  4877    1
cov[2,1]         0.01    0.00 0.00     0.01     0.01     0.01     0.01     0.01  4877    1
cov[2,2]         0.01    0.00 0.00     0.01     0.01     0.01     0.01     0.01  5442    1
x_rand[1]        0.17    0.00 0.14     0.01     0.08     0.14     0.22     0.47  7543    1
x_rand[2]        0.17    0.00 0.14     0.01     0.08     0.14     0.21     0.46  8201    1
attempt          0.37    0.01 0.70     0.00     0.00     0.00     1.00     2.00  7941    1
max_attempts    10.00     NaN 0.00    10.00    10.00    10.00    10.00    10.00   NaN  NaN
lp__         11696.50    0.03 1.76 11692.31 11695.54 11696.82 11697.81 11698.90  3562    1

Samples were drawn using NUTS(diag_e) at Sun Jul 23 01:21:18 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
print('---------------------------- First Pass Regression Prob.< 0.3--------------------------------------------')
[1] "---------------------------- First Pass Regression Prob.< 0.3--------------------------------------------"
print(fit_ereg_low)
Inference for Stan model: bivariate_normal_reg.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

                mean se_mean   sd    2.5%     25%     50%     75%   97.5% n_eff Rhat
mu[1]           0.11    0.00 0.00    0.11    0.11    0.11    0.11    0.12  7590    1
mu[2]           0.12    0.00 0.00    0.12    0.12    0.12    0.12    0.12  6828    1
sigma[1]        0.08    0.00 0.00    0.07    0.08    0.08    0.08    0.08  7489    1
sigma[2]        0.09    0.00 0.00    0.08    0.08    0.09    0.09    0.09  6449    1
nu             24.44    0.08 6.00   16.09   20.27   23.37   27.29   39.52  5520    1
rho             0.47    0.00 0.02    0.44    0.46    0.47    0.49    0.51  8485    1
cov[1,1]        0.01    0.00 0.00    0.01    0.01    0.01    0.01    0.01  7484    1
cov[1,2]        0.00    0.00 0.00    0.00    0.00    0.00    0.00    0.00  6928    1
cov[2,1]        0.00    0.00 0.00    0.00    0.00    0.00    0.00    0.00  6928    1
cov[2,2]        0.01    0.00 0.00    0.01    0.01    0.01    0.01    0.01  6439    1
x_rand[1]       0.13    0.00 0.07    0.01    0.08    0.12    0.17    0.28  7848    1
x_rand[2]       0.14    0.00 0.08    0.01    0.08    0.13    0.19    0.31  7951    1
attempt         0.17    0.00 0.44    0.00    0.00    0.00    0.00    1.00  7714    1
max_attempts   10.00     NaN 0.00   10.00   10.00   10.00   10.00   10.00   NaN  NaN
lp__         7720.01    0.03 1.74 7715.76 7719.08 7720.31 7721.28 7722.38  3981    1

Samples were drawn using NUTS(diag_e) at Sun Aug  6 00:32:16 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
print('---------------------------- First Pass Regression Prob.>= 0.3--------------------------------------------')
[1] "---------------------------- First Pass Regression Prob.>= 0.3--------------------------------------------"
print(fit_ereg_high)
Inference for Stan model: bivariate_normal_reg.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

               mean se_mean    sd   2.5%    25%    50%    75%  97.5% n_eff Rhat
mu[1]          0.44    0.00  0.01   0.43   0.44   0.44   0.45   0.46  5432    1
mu[2]          0.37    0.00  0.01   0.35   0.36   0.37   0.38   0.39  6127    1
sigma[1]       0.12    0.00  0.01   0.11   0.11   0.12   0.12   0.13  5531    1
sigma[2]       0.16    0.00  0.01   0.15   0.16   0.16   0.17   0.18  5479    1
nu            28.02    0.17 12.90  11.34  18.95  25.20  34.27  60.04  6081    1
rho            0.67    0.00  0.03   0.60   0.65   0.67   0.69   0.73  6652    1
cov[1,1]       0.01    0.00  0.00   0.01   0.01   0.01   0.01   0.02  5539    1
cov[1,2]       0.01    0.00  0.00   0.01   0.01   0.01   0.01   0.02  4763    1
cov[2,1]       0.01    0.00  0.00   0.01   0.01   0.01   0.01   0.02  4763    1
cov[2,2]       0.03    0.00  0.00   0.02   0.02   0.03   0.03   0.03  5491    1
x_rand[1]      0.44    0.00  0.12   0.21   0.36   0.45   0.53   0.68  7922    1
x_rand[2]      0.38    0.00  0.16   0.08   0.26   0.37   0.48   0.71  8307    1
attempt        0.02    0.00  0.13   0.00   0.00   0.00   0.00   0.00  8124    1
max_attempts  10.00     NaN  0.00  10.00  10.00  10.00  10.00  10.00   NaN  NaN
lp__         793.27    0.03  1.79 788.73 792.39 793.63 794.56 795.67  3841    1

Samples were drawn using NUTS(diag_e) at Sun Aug  6 00:36:26 2023.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
# # FPReg all data
stan_trace(fit_ereg_all)
'pars' not specified. Showing first 10 parameters by default.

stan_dens(fit_ereg_all, separate_chains = TRUE)
'pars' not specified. Showing first 10 parameters by default.

stan_plot(fit_ereg_all)
'pars' not specified. Showing first 10 parameters by default.
ci_level: 0.8 (80% intervals)
outer_level: 0.95 (95% intervals)

# # FPReg < 0.3
stan_trace(fit_ereg_low)
'pars' not specified. Showing first 10 parameters by default.

stan_dens(fit_ereg_low, separate_chains = TRUE)
'pars' not specified. Showing first 10 parameters by default.

stan_plot(fit_ereg_low)
'pars' not specified. Showing first 10 parameters by default.
ci_level: 0.8 (80% intervals)
outer_level: 0.95 (95% intervals)

# FPReg >= 0.3
stan_trace(fit_ereg_high)
'pars' not specified. Showing first 10 parameters by default.

stan_dens(fit_ereg_high, separate_chains = TRUE)
'pars' not specified. Showing first 10 parameters by default.

stan_plot(fit_ereg_high)
'pars' not specified. Showing first 10 parameters by default.
ci_level: 0.8 (80% intervals)
outer_level: 0.95 (95% intervals)

print('---------------------------- First Pass Regression all data--------------------------------------------')
[1] "---------------------------- First Pass Regression all data--------------------------------------------"
rho_ereg_all = as.numeric(extract(fit_ereg_all, "rho")[[1]])
mean = mean(rho_ereg_all)
crI = quantile(rho_ereg_all, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_ereg_all), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
Mean: 0.7339
HPD: [0.7128, 0.7557]
crI: [0.7119, 0.755]
print('---------------------------- First Pass Regression < 0.3--------------------------------------------')
[1] "---------------------------- First Pass Regression < 0.3--------------------------------------------"
rho_ereg_low = as.numeric(extract(fit_ereg_low, "rho")[[1]])
mean = mean(rho_ereg_low)
crI = quantile(rho_ereg_low, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_ereg_low), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
Mean: 0.4739
HPD: [0.4393, 0.5071]
crI: [0.4395, 0.5073]
print('---------------------------- First Pass Regression >= 0.3--------------------------------------------')
[1] "---------------------------- First Pass Regression >= 0.3--------------------------------------------"
rho_ereg_high = as.numeric(extract(fit_ereg_high, "rho")[[1]])
mean = mean(rho_ereg_high)
crI = quantile(rho_ereg_high, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_ereg_high), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
Mean: 0.6711
HPD: [0.607, 0.7342]
crI: [0.6033, 0.7315]
print('---------------------------- First Pass Regression all data --------------------------------------------')
[1] "---------------------------- First Pass Regression all data --------------------------------------------"
eallreg_rand <- extract(fit_ereg_all, "x_rand")[[1]]
# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "FPReg") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(eallreg_rand[,1], eallreg_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(ereg_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(eallreg_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(eallreg_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")


print('---------------------------- First Pass Regression < 0.3 --------------------------------------------')
[1] "---------------------------- First Pass Regression < 0.3 --------------------------------------------"
elowreg_rand <- extract(fit_ereg_low, "x_rand")[[1]]
# print(elowreg_rand)
# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "FPReg") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(elowreg_rand[,1], elowreg_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(ereg_temp_low, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(elowreg_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(elowreg_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")


print('---------------------------- First Pass Regression >= 0.3 --------------------------------------------')
[1] "---------------------------- First Pass Regression >= 0.3 --------------------------------------------"
ehighreg_rand_samples <- extract(fit_ereg_high, "x_rand")[[1]]
# print(mhighreg_rand_samples)
selected_indices <- sample(1:nrow(ehighreg_rand_samples), 900)
ehighreg_rand <- ehighreg_rand_samples[selected_indices, ]
# mhighreg_rand <- extract(fit_mreg_high, "x_rand")[[1]]
# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "FPReg") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(ehighreg_rand[,1], ehighreg_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(ereg_temp_high, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(ehighreg_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(ehighreg_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

---
title: "Exploratory Analysis for MoTR Reading Data"
output: html_notebook
---

```{r}
shhh <- suppressPackageStartupMessages # It's a library, so shhh!

shhh(library( mgcv ))
shhh(library(dplyr))
shhh(library(ggplot2))
shhh(library(lme4))
shhh(library(tidymv))
shhh(library(gamlss))
shhh(library(gsubfn))
shhh(library(lmerTest))
shhh(library(tidyverse))
shhh(library(boot))
shhh(library(rsample))
shhh(library(plotrix))
shhh(library(ggrepel))
shhh(library(mgcv))

shhh(library(brms))
shhh(library(bayesplot))
shhh(library(patchwork))
shhh(library(MASS))
shhh(library(tidyr))
shhh(library(extraDistr))
shhh(library(purrr))
# For exercises with Stan code
shhh(library(rstan))
options(mc.cores = parallel::detectCores())
rstan_options(auto_write = FALSE)

library(car)
library(coda)
shhh(library(gridExtra))

theme_set(theme_bw())
options(digits=4)
options(scipen=999)
set.seed(444)
pipe_message = function(.data, status) {message(status); .data}

```


# Read in MoTR Data

```{r}

rate = 160

file_prefix = "../data/provo_f160/"
fnames = list.files(path=file_prefix)

df = data.frame()
for (f in fnames) {
  temp = read.csv(paste0(file_prefix, "/", f)) %>%
    mutate(subj = str_remove(f, "_reading_measures.csv"))
  df = rbind(df, temp)
}

# Filter out readers who don't answer the comprehension questions correctly
filter_df = df %>%
  group_by(para_nr, subj) %>% summarise(correct = if_else(unique(correctness) == 1, 1, 0)) %>% ungroup() %>%
  drop_na() %>%
  group_by(subj) %>% summarise(p_correct = mean(correct)) %>% ungroup() %>%
  mutate(p_correct = round(p_correct, digits = 2))

filter_df = filter_df %>% filter(p_correct < 0.8)
filter_list = filter_df$subj
## reader_3:0.70, reader_60:0.79, reader_76:0.72 , reader_256:0.71 , reader_262:0.57 

raw_df = df %>%
  filter(! subj %in% c(filter_list)) %>%
  mutate(word = str_trim(word)) %>%
  mutate(subj = str_remove(subj, "reader_")) %>%
  mutate(subj = as.character(subj)) %>%
  # filter(! subj %in% c("3", "60", "76", "256", "262")) %>% # Explanation for this filtering
  mutate(FPReg = if_else(total_duration == 0, -1, FPReg)) %>% #If the word is skipped we can't say that it wasn't regressed on the first pass. Set to a "NA"
  dplyr::select(expr_id, cond_id, para_nr, word, word_nr, first_duration, total_duration, gaze_duration, go_past_time, FPReg, subj)

length(unique(raw_df$subj))

df %>%
  filter(! subj %in% c(filter_list)) %>%
  filter(FPReg >= 0) %>%
  dplyr::select(FPReg) %>%
  drop_na() %>%
  summarise( m = mean(FPReg))

df %>%
  filter(! subj %in% c(filter_list)) %>%
  dplyr::select(FPFix) %>%
  drop_na() %>%
  summarise( m = mean(FPFix))


```


```{r}
# Average across subjects
motr_agg_df = raw_df %>%
  gather(metric, value, 6:10) %>%
    filter(value >= 0) %>% #Removes the "NA" values for FPReg
  
    # ==== Remove skipped words
    # mutate(zero = if_else(metric != "FPReg" & value == 0,T, F)) %>%
    # filter(zero == F) %>%
  
    drop_na() %>%
    group_by(para_nr, word_nr, word, metric) %>% 
      mutate(outlier = if_else(metric != "FPReg" & value > (mean(value) + 3 * sd(value)), T, F)) %>% filter(outlier == F) %>%
      summarise(value = mean(value), nsubj = length(unique(subj))) %>%
  ungroup() %>%
  arrange(para_nr, word_nr) %>%
  rename(text_id = para_nr, word_text_idx = word_nr, motr_value = value)

```




# Comparison to Provo


```{r}
# Read in Provo surprisal, frequency and length data
provo_modeling_df = read.csv("../data/provo_stats.csv") %>%
  dplyr::select(text_id, sent_id, trigger_idx, word, freq, surp, len) %>%
  rename(word_idx = trigger_idx)

provo_modeling_df

```

```{r}
# Read in Provo eyetracking data

provo_raw_df = read.csv("../data/provo_eyetracking.csv")

```

```{r}

# unique(provo_raw_df$Participant_ID)
# length(unique(provo_raw_df$Participant_ID))

provo_eyetracking_df = provo_raw_df %>%
  dplyr::select(Participant_ID, Text_ID, Sentence_Number, Word_In_Sentence_Number, Word, Word_Number, IA_FIRST_FIX_PROGRESSIVE, IA_FIRST_RUN_DWELL_TIME, IA_DWELL_TIME, IA_REGRESSION_PATH_DURATION, IA_REGRESSION_OUT, IA_SKIP) %>%
  rename( #first_duration = IA_FIRST_FIXATION_DURATION,   
          gaze_duration = IA_FIRST_RUN_DWELL_TIME,
          total_duration = IA_DWELL_TIME,
          go_past_time = IA_REGRESSION_PATH_DURATION,
          subj = Participant_ID,
          text_id = Text_ID,
          sent_id = Sentence_Number,
          word_idx = Word_In_Sentence_Number,
          word_text_idx = Word_Number,   # IA_ID?
          word = Word,      # Word?
          FPReg = IA_REGRESSION_OUT,
          skip = IA_SKIP,
          ff_progressive = IA_FIRST_FIX_PROGRESSIVE) %>%
  mutate(first_duration = gaze_duration) %>%
  mutate(gaze_duration = if_else(ff_progressive == 0, 0, as.double(gaze_duration)),
         go_past_time = if_else(ff_progressive == 0, 0, as.double(go_past_time))) %>%
  dplyr::select(-ff_progressive) %>%
  
  mutate(
    gaze_duration = if_else(total_duration == 0, 0, as.double(gaze_duration)),
      go_past_time = if_else(total_duration == 0, 0, as.double(go_past_time)),
      FPReg = if_else(total_duration == 0, -1, as.double(FPReg)),
      first_duration =  if_else(total_duration == 0, 0, as.double(first_duration)),
  ) %>%
  
  # drop_na() %>%     # will drop the whole row with all the metrics
  gather(metric, value, 7:12) %>%
  filter(value >= 0) %>%          # filter skipped word in eye tracking data for FPReg
  # ==== Remove skipped words
  # mutate(zero = if_else(metric != "FPReg" & value == 0,T, F)) %>%
  # filter(zero == F) %>%
  
  # mutate(value = if_else(is.na(value), as.integer(0), as.integer(value))) %>%
  # mutate(value = if_else(metric != "FPReg" & is.na(value), as.integer(0), as.integer(value))) %>%
  drop_na() %>%
  mutate(word = str_trim(word)) %>%
  mutate(subj = str_remove(subj, "Sub")) %>%
  mutate(subj = as.integer(subj)) %>%
    group_by(text_id, word_text_idx, sent_id, word_idx, word, metric) %>%
    mutate(outlier = if_else(metric != "FPReg" & metric != "skip" & value > (mean(value) + 3 * sd(value) ), T, F)) %>%
    filter(outlier == F) %>%
  ungroup() #%>%

# Aggregate cross-participant data for all subjects
provo_eyetracking_agg_df = provo_eyetracking_df %>%
  group_by(text_id, word_text_idx, sent_id, word_idx, word, metric) %>%
    summarise(value = mean(value),
              nsubj = length(unique(subj))) %>%
    ungroup()

provo_raw_df %>%
  dplyr::select(IA_REGRESSION_OUT) %>%
  drop_na() %>%
  summarise( m = mean(IA_REGRESSION_OUT))

provo_raw_df %>%
  dplyr::select(IA_SKIP) %>%
  drop_na() %>%
  summarise( m = mean(IA_SKIP))


```

```{r}

# Split the eyetracking data in two by subjects to see how well it correlates with itself
provo_eyetracking_subj1_df_temp = provo_eyetracking_df %>%
  filter(subj <= 42) %>%
  mutate(word_text_idx = as.integer(word_text_idx - 1)) %>%
  group_by(text_id, word_text_idx, sent_id, word_idx, word, metric) %>%
    summarise(value = mean(value)) %>%
  ungroup() %>%
  rename(value_1 = value) #%>%
  # dplyr::select(-sent_id, -word_idx)


provo_eyetracking_subj1_df = merge(provo_eyetracking_subj1_df_temp, motr_agg_df, by=c("text_id", "word_text_idx", "metric")) %>%
  arrange(text_id, sent_id, word_idx) %>%
  filter(!(text_id == 13 & word_text_idx >= 20 & word_text_idx <= 52)) %>%
  filter(!(text_id == 3 & word_text_idx >= 46 & word_text_idx <= 57)) %>%
  rename(word = word.y) %>%
  dplyr::select(text_id, word_text_idx, metric, word, value_1)

# View(provo_eyetracking_subj1_df)

provo_eyetracking_subj2_df = provo_eyetracking_df %>%
  filter(subj > 42) %>%
  mutate(word_text_idx = as.integer(word_text_idx - 1)) %>%
  group_by(text_id, word_text_idx, sent_id, word_idx, word, metric) %>%
    summarise(value = mean(value)) %>%
  ungroup() %>%
    rename(value_2 = value)%>%
  dplyr::select(-sent_id, -word_idx)

# View(provo_eyetracking_subj2_df)
  
provo_eyetr_grouped_df = merge(provo_eyetracking_subj2_df, provo_eyetracking_subj1_df, by=c("text_id", "word_text_idx", "metric")) %>%
  # filter(word.x == word.y) %>%
  dplyr::select(-word.y) %>%
  group_by(metric) %>%
    mutate(motr_outlier = if_else(metric != "FPReg" & metric != "skip" & value_1 > (mean(value_1) + 3 * sd(value_1) ), T, F)) %>%
    filter(motr_outlier == F) %>%
    mutate(eyetr_outlier = if_else(metric != "FPReg" & metric != "skip" & value_2 > (mean(value_2) + 3 * sd(value_2) ), T, F)) %>%
    filter(eyetr_outlier == F) %>%
  ungroup() %>%
  gather(measure, value, c("value_1", "value_2")) %>%
  dplyr::select(-motr_outlier, -eyetr_outlier)

# View(provo_eyetr_grouped_df)

```


```{r}
provo_df = merge(provo_eyetracking_agg_df, provo_modeling_df, by=c("text_id", "sent_id", "word_idx")) %>%
  mutate(word_text_idx = as.integer(word_text_idx - 1)) %>%
  arrange(text_id, sent_id, word_idx) %>%
  rename(eyetr_value = value) 

provo_df = merge(provo_df, motr_agg_df, by=c("text_id", "word_text_idx", "metric")) %>%
arrange(text_id, sent_id, word_idx) %>%
  # almost all the word.x != word.y is because of normalization problem, so we can keep them, instead, deleting some special cases
filter(!(text_id == 13 & word_text_idx >= 20 & word_text_idx <= 52)) %>%
  filter(!(text_id == 3 & word_text_idx >= 46 & word_text_idx <= 57)) %>%
# filter(word.x == word) #%>%
dplyr::select(-word.x, -word.y) %>%
group_by(metric) %>%
  mutate(motr_outlier = if_else(metric != "FPReg" & motr_value > (mean(motr_value) + 3 * sd(motr_value) ), T, F)) %>%
  filter(motr_outlier == F) %>%
  mutate(eyetr_outlier = if_else(metric != "FPReg" & eyetr_value > (mean(eyetr_value) + 3 * sd(eyetr_value) ), T, F)) %>%
  filter(eyetr_outlier == F) %>%
ungroup() %>%
gather(measure, value, c("eyetr_value", "motr_value")) %>%
dplyr::select(-motr_outlier, -eyetr_outlier)
  
# provo_df
```


# Bayesian -- use Stan -- motr & eyetr correlation
```{r}
print("Gaze Duration")
gd_df = provo_df %>% filter(metric == "gaze_duration") %>% 
  spread(measure, value) %>%
  # smoothing, if includes 0s
  mutate(eyetr_value =  pmax(eyetr_value, 1),
         motr_value = pmax(motr_value, 1)
  ) %>%
  mutate(eyetr_value_log = log(eyetr_value),
         motr_value_log = log(motr_value))
print(cor.test(gd_df$eyetr_value, gd_df$motr_value)$estimate)
print(cor.test(gd_df$eyetr_value_log, gd_df$motr_value_log)$estimate)
# View(gd_df)
```


```{r}
gd_df %>% 
  gather(measure, value, 12:15) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")
  
```



```{r}
# center data around 0.

gd_temp <- gd_df[c("eyetr_value", "motr_value")] %>%
   # mutate(eyetr_value = eyetr_value - mean(eyetr_value),
   #      motr_value = motr_value - mean(motr_value)) %>%
  data.matrix()

gd_temp_log <- gd_df[c("eyetr_value_log", "motr_value_log")] %>%
 mutate(eyetr_value_log = eyetr_value_log - mean(eyetr_value_log), 
        motr_value_log = motr_value_log - mean(motr_value_log)) %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))
# Plot the first data matrix gd_temp
plot(gd_temp, pch = 16, col = "blue",
     main = "Not Log-Transformed")
# Plot the second data matrix gd_temp_log
plot(gd_temp_log, pch = 16, col = "red",
     main = "Centered Log-Transformed")

```



```{r, eval=FALSE}
gd_data = list(x=gd_temp, N=nrow(gd_temp))

fit_gd = stan(
  file="stan_models/bivariate_correlation.stan", 
  data=gd_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  verbose = FALSE
  )

# Save the model 
fit_gd@stanmodel@dso <- new("cxxdso")
saveRDS(fit_gd, file = paste0("./bayesian_models/bayesian_models_correlation/motr_eyetr_gaze_duration_cor_drop0s.rds"))

```

```{r}
print("Go Past Time")
gpt_df = provo_df %>% filter(metric == "go_past_time") %>% 
  spread(measure, value) %>%
  # smoothing, if includes 0s
  mutate(eyetr_value =  pmax(eyetr_value, 1),
         motr_value = pmax(motr_value, 1)
  ) %>%
  mutate(eyetr_value_log = log(eyetr_value),
         motr_value_log = log(motr_value))
print(cor.test(gpt_df$eyetr_value, gpt_df$motr_value)$estimate)
print(cor.test(gpt_df$eyetr_value_log, gpt_df$motr_value_log)$estimate)

gpt_df %>% 
  gather(measure, value, 12:15) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

gpt_temp <- gpt_df[c("eyetr_value", "motr_value")] %>% data.matrix()

gpt_temp_log <- gpt_df[c("eyetr_value_log", "motr_value_log")] %>%
 mutate(eyetr_value_log = eyetr_value_log - mean(eyetr_value_log), 
        motr_value_log = motr_value_log - mean(motr_value_log)) %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))
# Plot the first data matrix gpt_temp
plot(gpt_temp, pch = 16, col = "blue",
     main = "Not Log-Transformed")
# Plot the second data matrix gpt_temp_log
plot(gpt_temp_log, pch = 16, col = "red",
     main = "Centered Log-Transformed")
```

```{r, eval=FALSE}
# -------fit model go past time ----------
gpt_data = list(x=gpt_temp, N=nrow(gpt_temp))
fit_gpt = stan(
  file="stan_models/bivariate_correlation.stan", 
  data=gpt_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  verbose = FALSE
  )

# Save the model 
fit_gpt@stanmodel@dso <- new("cxxdso")
saveRDS(fit_gpt, file = paste0("./bayesian_models/bayesian_models_correlation/motr_eyetr_go_past_time_cor_drop0s.rds"))
```


```{r}
print("Total Duration")
td_df = provo_df %>% filter(metric == "total_duration") %>% 
  spread(measure, value) %>%
  # smoothing, if includes 0s
  mutate(eyetr_value =  pmax(eyetr_value, 1),
         motr_value = pmax(motr_value, 1)
  ) %>%
  mutate(eyetr_value_log = log(eyetr_value),
         motr_value_log = log(motr_value))
print(cor.test(td_df$eyetr_value, td_df$motr_value)$estimate)
print(cor.test(td_df$eyetr_value_log, td_df$motr_value_log)$estimate)

td_df %>% 
  gather(measure, value, 12:15) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

td_temp <- td_df[c("eyetr_value", "motr_value")] %>% data.matrix()

td_temp_log <- td_df[c("eyetr_value_log", "motr_value_log")] %>%
 mutate(eyetr_value_log = eyetr_value_log - mean(eyetr_value_log), 
        motr_value_log = motr_value_log - mean(motr_value_log)) %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))
# Plot the first data matrix td_temp
plot(td_temp, pch = 16, col = "blue",
     main = "Not Log-Transformed")
# Plot the second data matrix td_temp_log
plot(td_temp_log, pch = 16, col = "red",
     main = "Centered Log-Transformed")
```

```{r, eval=FALSE}
# -------fit model total duration ----------
td_data = list(x=td_temp, N=nrow(td_temp))
fit_td = stan(
  file="stan_models/bivariate_correlation.stan", 
  data=td_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  verbose = FALSE
  )

# Save the model 
fit_td@stanmodel@dso <- new("cxxdso")
saveRDS(fit_td, file = paste0("./bayesian_models/bayesian_models_correlation/motr_eyetr_total_duration_cor_drop0s.rds"))
```

```{r}
print("First Pass Regression Prob.")
reg_df = provo_df %>% filter(metric == "FPReg") %>% 
  spread(measure, value) %>%
  filter(eyetr_value > 0, motr_value > 0) %>%
  mutate(eyetr_value =  pmax(eyetr_value, 1e-5),
         motr_value = pmax(motr_value, 1e-5))
print(cor.test(reg_df$eyetr_value, reg_df$motr_value)$estimate)

# View(reg_df)
reg_df %>% 
  gather(measure, value, 12:13) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

reg_temp <- reg_df[c("eyetr_value", "motr_value")] %>% data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))
# Plot the first data matrix td_temp
plot(reg_temp, pch = 16, col = "blue",
     main = "Not Log-Transformed")
```

```{r, eval=FALSE}
# -------fit model FPReg ----------
reg_data = list(x=reg_temp, N=nrow(reg_temp))
fit_reg = stan(
  file="stan_models/bivariate_normal_reg.stan", 
  data=reg_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  verbose = FALSE
  )

# Save the model 
fit_reg@stanmodel@dso <- new("cxxdso")
saveRDS(fit_reg, file = paste0("./bayesian_models/bayesian_models_correlation/motr_eyetr_FPReg_cor.rds"))
```



```{r}
# models with all 0s
fit_gd = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_gaze_duration_cor.rds")
fit_gpt = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_go_past_time_cor.rds")
fit_td = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_total_duration_cor.rds")
fit_reg = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_FPReg_cor.rds")

# models for drop 0s
# fit_gd = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_gaze_duration_cor_drop0s.rds")
# fit_gpt = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_go_past_time_cor_drop0s.rds")
# fit_td = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_total_duration_cor_drop0s.rds")
# fit_reg = readRDS("./bayesian_models/bayesian_models_correlation/ranked_motr_eyetr_FPReg_cor.rds")

print('---------------------------- Gaze Duration--------------------------------------------')
print(fit_gd)
print('---------------------------- Go Past Time--------------------------------------------')
print(fit_gpt)
print('---------------------------- Total Duration--------------------------------------------')
print(fit_td)
print('---------------------------- First Pass Regression Prob.--------------------------------------------')
print(fit_reg)

```

```{r, eval=FALSE}
# stan_trace(fit_gd, pars=c("rho", "mu", "sigma", "nu"))
# stan_dens(fit_gd, pars=c("rho", "mu", "sigma", "nu"), separate_chains = TRUE)
# stan_plot(fit_gd, pars=c("rho", "mu", "sigma", "nu"))

# Gaze Duration
stan_trace(fit_gd)
stan_dens(fit_gd, separate_chains = TRUE)
stan_plot(fit_gd)

# Go Past Time
stan_trace(fit_gpt)
stan_dens(fit_gpt, separate_chains = TRUE)
stan_plot(fit_gpt)

# Total Duration
stan_trace(fit_td)
stan_dens(fit_td, separate_chains = TRUE)
stan_plot(fit_td)

# FPReg
stan_trace(fit_reg)
stan_dens(fit_reg, separate_chains = TRUE)
stan_plot(fit_reg)

```

```{r, fig.width=5, fig.height=10, eval=FALSE}
p1 <- stan_trace(fit_gd, pars = 'rho', inc_warmup = FALSE)
p2 <- stan_dens(fit_gd, pars = 'rho', separate_chains = TRUE)
p3 <- stan_trace(fit_gd, pars = 'mu[1]', inc_warmup = FALSE)
p4 <- stan_dens(fit_gd, pars = 'mu[1]', separate_chains = TRUE)
p5 <- stan_trace(fit_gd, pars = 'mu[2]', inc_warmup = FALSE)
p6 <- stan_dens(fit_gd, pars = 'mu[2]', separate_chains = TRUE)
p7 <- stan_trace(fit_gd, pars = 'sigma[1]', inc_warmup = FALSE)
p8 <- stan_dens(fit_gd, pars = 'sigma[1]', separate_chains = TRUE)
p9 <- stan_trace(fit_gd, pars = 'sigma[2]', inc_warmup = FALSE)
p10 <- stan_dens(fit_gd, pars = 'sigma[2]', separate_chains = TRUE)
p11 <- stan_trace(fit_gd, pars = 'nu', inc_warmup = FALSE)
p12 <- stan_dens(fit_gd, pars = 'nu', separate_chains = TRUE)


# Use grid.arrange() to arrange the plots
# grid.arrange(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, ncol=2, nrow=6)

```



```{r}
print('---------------------------- Gaze Duration--------------------------------------------')
rho_gd = as.numeric(extract(fit_gd, "rho")[[1]])
mean = mean(rho_gd)
crI = quantile(rho_gd, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_gd), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")

print('---------------------------- Go Past Time--------------------------------------------')
rho_gpt = as.numeric(extract(fit_gpt, "rho")[[1]])
mean = mean(rho_gpt)
crI = quantile(rho_gpt, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_gpt), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")

print('---------------------------- Total Duration--------------------------------------------')
rho_td = as.numeric(extract(fit_td, "rho")[[1]])
mean = mean(rho_td)
crI = quantile(rho_td, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_td), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")

print('---------------------------- First Pass Regression --------------------------------------------')
# rho_reg = as.numeric(extract(fit_reg, "rho[1, 2]")[[1]])
rho_reg = as.numeric(extract(fit_reg, "rho")[[1]])
mean = mean(rho_reg)
crI = quantile(rho_reg, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_reg), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]")
```


```{r, eval=FALSE}
print('---------------------------- Gaze Duration--------------------------------------------')
gd_rand <- extract(fit_gd, "x_rand")[[1]]
# x_rand_filtered <- x_rand[apply(x_rand, 1, function(x) all(x > 0)),]
# x_rand_filtered

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 400), ylim=c(0, 700), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "Gaze Duration") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(gd_rand[,1], gd_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(gd_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(gd_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(gd_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- Go Past Time--------------------------------------------')
gpt_rand <- extract(fit_gpt, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 800), ylim=c(0, 1200), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "Go Past Time") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(gpt_rand[,1], gpt_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(gpt_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(gpt_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(gpt_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- Total Duration--------------------------------------------')
td_rand <- extract(fit_td, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 800), ylim=c(0, 1200), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "Total Duration") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(td_rand[,1], td_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(td_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(td_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(td_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- First Pass Regression --------------------------------------------')
reg_rand <- extract(fit_reg, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "FPReg") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(reg_rand[,1], reg_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(reg_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(reg_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(reg_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")


```

# model motr eyetr FPReg correlation (eyetr < 0.3)
```{r}
print("First Pass Regression Prob. all and  < 0.3")
reg_df_all = provo_df %>% filter(metric == "FPReg") %>% 
  spread(measure, value) %>%
  # filter(eyetr_value > 0, motr_value > 0) %>%
  mutate(eyetr_value =  pmax(eyetr_value, 1e-5),
         motr_value = pmax(motr_value, 1e-5))

reg_df_low_drop0 = provo_df %>% filter(metric == "FPReg") %>% 
  spread(measure, value) %>%
  filter(eyetr_value > 0, motr_value > 0) %>%
  mutate(eyetr_value =  pmax(eyetr_value, 1e-5),
         motr_value = pmax(motr_value, 1e-5)) %>%
  filter(eyetr_value < 0.3)

reg_df_low = provo_df %>% filter(metric == "FPReg") %>% 
  spread(measure, value) %>%
  # filter(eyetr_value > 0, motr_value > 0) %>%
  mutate(eyetr_value =  pmax(eyetr_value, 1e-5),
         motr_value = pmax(motr_value, 1e-5)) %>%
  filter(eyetr_value < 0.3)
  # mutate(eyetr_value = exp(eyetr_value),
         # motr_value = exp(motr_value)
         # )
# View(reg_df)

print(cor.test(reg_df_all$eyetr_value, reg_df_all$motr_value)$estimate)
print(cor.test(reg_df_all$eyetr_value, reg_df_all$motr_value)$p.value)
print(cor.test(reg_df_low$eyetr_value, reg_df_low$motr_value)$estimate)
print(cor.test(reg_df_low$eyetr_value, reg_df_low$motr_value)$p.value)
print(cor.test(reg_df_low_drop0$eyetr_value, reg_df_low_drop0$motr_value)$estimate)
print(cor.test(reg_df_low_drop0$eyetr_value, reg_df_low_drop0$motr_value)$p.value)

# View(reg_df)
reg_df_low %>% 
  gather(measure, value, 12:13) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

reg_temp_all <- reg_df_all[c("eyetr_value", "motr_value")] %>% data.matrix()
reg_temp_low <- reg_df_low[c("eyetr_value", "motr_value")] %>% data.matrix()
reg_temp_low_drop0 <- reg_df_low_drop0[c("eyetr_value", "motr_value")] %>% data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))
# Plot the first data matrix td_temp
plot(reg_temp_low, pch = 16, col = "blue",
     main = "Not Log-Transformed")
```

```{r, eval=FALSE}
# -------fit model FPReg < 0.3 ----------
reg_data = list(x=reg_temp_all, N=nrow(reg_temp_all))
fit_reg = stan(
  # file="stan_models/bivariate_beta_correlation_reg.stan", 
  file = "stan_models/bivariate_normal_reg.stan",
  data=reg_data, 
  iter=4000, 
  chains=4, 
  cores=4,
  seed=444,
  # control=list(adapt_delta=0.99), 
  verbose = FALSE
  )

# Save the model 
fit_reg@stanmodel@dso <- new("cxxdso")
saveRDS(fit_reg, file = paste0("./bayesian_models/bayesian_models_correlation/motr_eyetr_FPReg_cor_all_data_drop0s.rds"))
```


# model motr eyetr FPReg correlation (eyetr >= 0.3)
```{r}
print("First Pass Regression Prob. >= 0.3")
reg_df_high_drop0 = provo_df %>% filter(metric == "FPReg") %>% 
  spread(measure, value) %>%
  filter(eyetr_value > 0, motr_value > 0) %>%
  mutate(eyetr_value =  pmax(eyetr_value, 1e-5),
         motr_value = pmax(motr_value, 1e-5)) %>%
  filter(eyetr_value >= 0.3)

reg_df_high = provo_df %>% filter(metric == "FPReg") %>% 
  spread(measure, value) %>%
  # filter(eyetr_value > 0, motr_value > 0) %>%
  mutate(eyetr_value =  pmax(eyetr_value, 1e-5),
         motr_value = pmax(motr_value, 1e-5)) %>%
  filter(eyetr_value >= 0.3)
  # mutate(eyetr_value = exp(eyetr_value),
         # motr_value = exp(motr_value)
         # )
# View(reg_df)

print(cor.test(reg_df_high$eyetr_value, reg_df_high$motr_value)$estimate)
print(cor.test(reg_df_high$eyetr_value, reg_df_high$motr_value)$p.value)
print(cor.test(reg_df_high_drop0$eyetr_value, reg_df_high_drop0$motr_value)$estimate)
print(cor.test(reg_df_high_drop0$eyetr_value, reg_df_high_drop0$motr_value)$p.value)

# View(reg_df)
reg_df_high %>% 
  gather(measure, value, 12:13) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

reg_temp_high <- reg_df_high[c("eyetr_value", "motr_value")] %>% data.matrix()
reg_temp_high_drop0 <- reg_df_high_drop0[c("eyetr_value", "motr_value")] %>% data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 3))
# Plot the first data matrix td_temp
plot(reg_temp_all, pch = 16, col = "blue",
     main = "FPReg not logged all data")
plot(reg_temp_low, pch = 16, col = "blue",
     main = "FPReg not logged eyetr < 0.3 ")
plot(reg_temp_high, pch = 16, col = "blue",
     main = "FPReg not logged eyetr >= 0.3")
```

```{r, eval=FALSE}
# -------fit model FPReg >= 0.3 ----------
reg_data = list(x=reg_temp_high_drop0, N=nrow(reg_temp_high_drop0))
fit_reg = stan(
  # file="stan_models/bivariate_beta_correlation_reg.stan", 
  file = "stan_models/bivariate_normal_reg.stan",
  data=reg_data, 
  iter=4000, 
  chains=4, 
  cores=4,
  seed=444,
  # control=list(adapt_delta=0.99), 
  verbose = FALSE
  )

# Save the model 
fit_reg@stanmodel@dso <- new("cxxdso")
saveRDS(fit_reg, file = paste0(".//bayesian_models/bayesian_models_correlation/motr_eyetr_FPReg_cor_03-1_drop0s.rds"))
```



```{r}
fit_mreg_all = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_FPReg_cor_all_data.rds")
fit_mreg_all_drop0 = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_FPReg_cor_all_data_drop0s.rds")
fit_mreg_low = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_FPReg_cor_00-03.rds")
fit_mreg_low_drop0 = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_FPReg_cor_00-03_drop0s.rds")
fit_mreg_high = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_FPReg_cor_03-1.rds")
fit_mreg_high_drop0 = readRDS("./bayesian_models/bayesian_models_correlation/motr_eyetr_FPReg_cor_03-1_drop0s.rds")

print('---------------------------- First Pass Regression Prob. all data --------------------------------------------')
print(fit_mreg_all)
print('---------------------------- First Pass Regression Prob. all data no 0s--------------------------------------------')
print(fit_mreg_all_drop0)
print('---------------------------- First Pass Regression Prob.< 0.3--------------------------------------------')
print(fit_mreg_low)
print('---------------------------- First Pass Regression Prob.< 0.3 no 0s--------------------------------------------')
print(fit_mreg_low_drop0)
print('---------------------------- First Pass Regression Prob.>= 0.3--------------------------------------------')
print(fit_mreg_high)
print('---------------------------- First Pass Regression Prob.>= 0.3 no 0s--------------------------------------------')
print(fit_mreg_high_drop0)
```

```{r}
# # FPReg all data
# stan_trace(fit_mreg_all)
# stan_dens(fit_mreg_all, separate_chains = TRUE)
# stan_plot(fit_mreg_all)

# stan_trace(fit_mreg_all_drop0)
# stan_dens(fit_mreg_all_drop0, separate_chains = TRUE)
# stan_plot(fit_mreg_all_drop0)

# # FPReg < 0.3
# stan_trace(fit_mreg_low)
# stan_dens(fit_mreg_low, separate_chains = TRUE)
# stan_plot(fit_mreg_low)
# 
# stan_trace(fit_mreg_low_drop0)
# stan_dens(fit_mreg_low_drop0, separate_chains = TRUE)
# stan_plot(fit_mreg_low_drop0)

# FPReg > 0.3
stan_trace(fit_mreg_high)
stan_dens(fit_mreg_high, separate_chains = TRUE)
stan_plot(fit_mreg_high)

stan_trace(fit_mreg_high_drop0)
stan_dens(fit_mreg_high_drop0, separate_chains = TRUE)
stan_plot(fit_mreg_high_drop0)
```

```{r}
print('---------------------------- First Pass Regression all data--------------------------------------------')
rho_mreg_all = as.numeric(extract(fit_mreg_all, "rho")[[1]])
mean = mean(rho_mreg_all)
crI = quantile(rho_mreg_all, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_mreg_all), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")

print('---------------------------- First Pass Regression all data no 0s--------------------------------------------')
rho_mreg_all_drop0 = as.numeric(extract(fit_mreg_all_drop0, "rho")[[1]])
mean = mean(rho_mreg_all_drop0)
crI = quantile(rho_mreg_all_drop0, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_mreg_all_drop0), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")

print('---------------------------- First Pass Regression < 0.3--------------------------------------------')
rho_mreg_low = as.numeric(extract(fit_mreg_low, "rho")[[1]])
mean = mean(rho_mreg_low)
crI = quantile(rho_mreg_low, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_mreg_low), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")

print('---------------------------- First Pass Regression < 0.3 no 0s--------------------------------------------')
rho_mreg_low_drop0 = as.numeric(extract(fit_mreg_low_drop0, "rho")[[1]])
mean = mean(rho_mreg_low_drop0)
crI = quantile(rho_mreg_low_drop0, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_mreg_low_drop0), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")

print('---------------------------- First Pass Regression >= 0.3--------------------------------------------')
rho_mreg_high = as.numeric(extract(fit_mreg_high, "rho")[[1]])
mean = mean(rho_mreg_high)
crI = quantile(rho_mreg_high, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_mreg_high), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")

print('---------------------------- First Pass Regression >= 0.3 no 0s--------------------------------------------')
rho_mreg_high_drop0 = as.numeric(extract(fit_mreg_high_drop0, "rho")[[1]])
mean = mean(rho_mreg_high_drop0)
crI = quantile(rho_mreg_high_drop0, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_mreg_high_drop0), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
```

```{r, eval=FALSE}
print('---------------------------- First Pass Regression all data --------------------------------------------')
mallreg_rand <- extract(fit_mreg_all, "x_rand")[[1]]
# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "FPReg") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(mallreg_rand[,1], mallreg_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(reg_temp_all, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(mallreg_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(mallreg_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- First Pass Regression all data no 0s--------------------------------------------')
mallreg_rand_drop0 <- extract(fit_mreg_all_drop0, "x_rand")[[1]]
# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "FPReg") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(mallreg_rand_drop0[,1], mallreg_rand_drop0[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(reg_temp_all, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(mallreg_rand_drop0, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(mallreg_rand_drop0, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- First Pass Regression < 0.3 --------------------------------------------')
mlowreg_rand <- extract(fit_mreg_low, "x_rand")[[1]]
print(mlowreg_rand)
# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "FPReg") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(mlowreg_rand[,1], mlowreg_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(reg_temp_low, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(mlowreg_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(mlowreg_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- First Pass Regression < 0.3 no 0s --------------------------------------------')
mlowreg_rand_drop0 <- extract(fit_mreg_low_drop0, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "FPReg") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(mlowreg_rand_drop0[,1], mlowreg_rand_drop0[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(reg_temp_low_drop0, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(mlowreg_rand_drop0, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(mlowreg_rand_drop0, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- First Pass Regression >= 0.3 --------------------------------------------')
mhighreg_rand_samples <- extract(fit_mreg_high, "x_rand")[[1]]
# print(mhighreg_rand_samples)
selected_indices <- sample(1:nrow(mhighreg_rand_samples), 900)
mhighreg_rand <- mhighreg_rand_samples[selected_indices, ]
# mhighreg_rand <- extract(fit_mreg_high, "x_rand")[[1]]
# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "FPReg") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(mhighreg_rand[,1], mhighreg_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(reg_temp_high, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(mhighreg_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(mhighreg_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

# print('---------------------------- First Pass Regression >= 0.3 no 0s --------------------------------------------')
mhighreg_rand_drop0_samples <- extract(fit_mreg_high_drop0, "x_rand")[[1]]
selected_indices <- sample(1:nrow(mhighreg_rand_drop0_samples), 900)
mhighreg_rand_drop0 <- mhighreg_rand_drop0_samples[selected_indices, ]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "FPReg") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color
points(mhighreg_rand_drop0[,1], mhighreg_rand_drop0[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(reg_temp_high_drop0, pch=16, col="red")

# add dataEllipse with color
dataEllipse(mhighreg_rand_drop0, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(mhighreg_rand_drop0, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

```


# model eye tracking to eye tracking correlation
```{r}

print("Gaze Duration")
# View(provo_eyetr_grouped_df)

egd_df = provo_eyetr_grouped_df %>% filter(metric == "gaze_duration") %>% #group_by(text_id, metric, measure) %>%
  # summarize(value = mean(value, na.rm = TRUE), .groups = 'drop') %>%
    spread(measure, value) %>%
  # smoothing, if includes 0s
  mutate(eyetr_value_1 =  pmax(value_1, 1),
         eyetr_value_2 = pmax(value_2, 1)
  ) 
print(cor.test(egd_df$eyetr_value_1, egd_df$eyetr_value_2)$estimate)

# View(egd_df)

egd_df %>% 
  gather(measure, value, 5:6) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

egd_temp <- egd_df[c("eyetr_value_1", "eyetr_value_2")] %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))
# Plot the first data matrix gd_temp
plot(egd_temp, pch = 16, col = "blue",
     main = "Not Log-Transformed")

```

```{r, eval=FALSE}
egd_data = list(x=egd_temp, N=nrow(egd_temp))

fit_egd = stan(
  file="stan_models/bivariate_correlation.stan", 
  data=egd_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  verbose = FALSE
  )

# Save the model 
fit_egd@stanmodel@dso <- new("cxxdso")
saveRDS(fit_egd, file = paste0("./bayesian_models/bayesian_models_correlation/cor_bivariate/eyetr_eyetr_gaze_duration_cor.rds"))

```

```{r}
print("Go Past Time")

egpt_df = provo_eyetr_grouped_df %>% filter(metric == "go_past_time") %>% #group_by(text_id, metric, measure) %>%
  # summarize(value = mean(value, na.rm = TRUE), .groups = 'drop') %>%
    spread(measure, value) %>%
  # smoothing, if includes 0s
  mutate(eyetr_value_1 =  pmax(value_1, 1),
         eyetr_value_2 = pmax(value_2, 1)
  ) 
print(cor.test(egpt_df$eyetr_value_1, egpt_df$eyetr_value_2)$estimate)

# View(egd_df)

egpt_df %>% 
  gather(measure, value, 5:6) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

egpt_temp <- egpt_df[c("eyetr_value_1", "eyetr_value_2")] %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))
# Plot the first data matrix gd_temp
plot(egpt_temp, pch = 16, col = "blue",
     main = "Not Log-Transformed")
```

```{r, eval=FALSE}
egpt_data = list(x=egpt_temp, N=nrow(egpt_temp))

fit_egpt = stan(
  file="stan_models/bivariate_correlation.stan", 
  data=egpt_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  verbose = FALSE
  )

# Save the model 
fit_egpt@stanmodel@dso <- new("cxxdso")
saveRDS(fit_egpt, file = paste0("./bayesian_models/bayesian_models_correlation/eyetr_eyetr_go_past_time_cor.rds"))

```


```{r}
print("Total Duration")

etd_df = provo_eyetr_grouped_df %>% filter(metric == "total_duration") %>% #group_by(text_id, metric, measure) %>%
  # summarize(value = mean(value, na.rm = TRUE), .groups = 'drop') %>%
    spread(measure, value) %>%
  # smoothing, if includes 0s
  mutate(eyetr_value_1 =  pmax(value_1, 1),
         eyetr_value_2 = pmax(value_2, 1)
  ) 
print(cor.test(etd_df$eyetr_value_1, etd_df$eyetr_value_2)$estimate)

# View(egd_df)

etd_df %>% 
  gather(measure, value, 5:6) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

etd_temp <- etd_df[c("eyetr_value_1", "eyetr_value_2")] %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))
# Plot the first data matrix gd_temp
plot(etd_temp, pch = 16, col = "blue",
     main = "Total Duration Not Log-Transformed")
```


```{r, eval=FALSE}
etd_data = list(x=etd_temp, N=nrow(etd_temp))

fit_etd = stan(
  file="stan_models/bivariate_correlation.stan", 
  data=etd_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  # verbose = FALSE
  )

# Save the model 
fit_etd@stanmodel@dso <- new("cxxdso")
saveRDS(fit_etd, file = paste0("./bayesian_models/bayesian_models_correlation/eyetr_eyetr_total_duration_cor.rds"))
```


```{r}
print("Fisrt Pass Regression Prob.")

ereg_df = provo_eyetr_grouped_df %>% filter(metric == "FPReg") %>% distinct() %>% #group_by(text_id, metric, measure) %>%
  # summarize(value = mean(value)) %>%
  filter(!(row_number() %in% c(443, 444, 445, 446))) %>%
    spread(measure, value) %>%
  # smoothing, if includes 0s
  mutate(eyetr_value_1 =  pmax(value_1, 1e-5),
         eyetr_value_2 = pmax(value_2, 1e-5)
  )
print(cor.test(ereg_df$eyetr_value_1, ereg_df$eyetr_value_2)$estimate)

# View(egd_df)

ereg_df %>% 
  gather(measure, value, 5:6) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

ereg_temp <- ereg_df[c("eyetr_value_1", "eyetr_value_2")] %>%
  drop_na() %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))
# Plot the first data matrix gd_temp
plot(ereg_temp, pch = 16, col = "blue",
     main = "FPReg Not Log-Transformed")
```

```{r, eval=FALSE}
# -------fit model FPReg ----------

# View(ereg_temp)
ereg_data = list(x=ereg_temp, N=nrow(ereg_temp))
fit_ereg = stan(
  file="stan_models/bivariate_normal_reg.stan", 
  data=ereg_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99),
  verbose = FALSE
  )

# Save the model 
fit_ereg@stanmodel@dso <- new("cxxdso")
saveRDS(fit_ereg, file = paste0("./bayesian_models/bayesian_models_correlation/eyetr_eyetr_FPReg_cor5.rds"))
```


```{r}
fit_egd = readRDS("./bayesian_models/bayesian_models_correlation/eyetr_eyetr_gaze_duration_cor.rds")
fit_egpt = readRDS("./bayesian_models/bayesian_models_correlation/eyetr_eyetr_go_past_time_cor.rds")
fit_etd = readRDS("./bayesian_models/bayesian_models_correlation/eyetr_eyetr_total_duration_cor.rds")
fit_ereg = readRDS("./bayesian_models/bayesian_models_correlation/ranked_eyetr_eyetr_FPReg_cor.rds")
print('---------------------------- Gaze Duration--------------------------------------------')
print(fit_egd)
print('---------------------------- Go Past Time--------------------------------------------')
print(fit_egpt)
print('---------------------------- Total Duration--------------------------------------------')
print(fit_etd)
print('---------------------------- First Pass Regression Prob.--------------------------------------------')
print(fit_ereg)
```


```{r, eval=FALSE}
# stan_trace(fit_egd, pars=c("rho", "mu", "sigma", "nu"))
# stan_dens(fit_egd, pars=c("rho", "mu", "sigma", "nu"), separate_chains = TRUE)
# stan_plot(fit_egd, pars=c("rho", "mu", "sigma", "nu"))

# Gaze Duration
stan_trace(fit_egd)
stan_dens(fit_egd, separate_chains = TRUE)
stan_plot(fit_egd)

# Go Past Time
stan_trace(fit_egpt)
stan_dens(fit_egpt, separate_chains = TRUE)
stan_plot(fit_egpt)

# Total Duration
stan_trace(fit_etd)
stan_dens(fit_etd, separate_chains = TRUE)
stan_plot(fit_etd)

# FPReg
stan_trace(fit_ereg)
stan_dens(fit_ereg, separate_chains = TRUE)
stan_plot(fit_ereg)
```


```{r}
print('---------------------------- Gaze Duration--------------------------------------------')
rho_egd = as.numeric(extract(fit_egd, "rho")[[1]])
mean = mean(rho_egd)
crI = quantile(rho_egd, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_egd), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")

print('---------------------------- Go Past Time--------------------------------------------')
rho_egpt = as.numeric(extract(fit_egpt, "rho")[[1]])
mean = mean(rho_egpt)
crI = quantile(rho_egpt, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_egpt), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")

print('---------------------------- Total Duration--------------------------------------------')
rho_etd = as.numeric(extract(fit_etd, "rho")[[1]])
mean = mean(rho_etd)
crI = quantile(rho_etd, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_etd), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")

print('---------------------------- First Pass Regression --------------------------------------------')
rho_ereg = as.numeric(extract(fit_ereg, "rho")[[1]])
mean = mean(rho_ereg)
crI = quantile(rho_ereg, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_ereg), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]")
```


```{r, eval=FALSE}
print('---------------------------- Gaze Duration--------------------------------------------')
egd_rand <- extract(fit_egd, "x_rand")[[1]]
# x_rand_filtered <- x_rand[apply(x_rand, 1, function(x) all(x > 0)),]
# x_rand_filtered

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 400), ylim=c(0, 700), type="n",
     xlab = "Eye tracking value 1", ylab = "Eye tracking value 2", main = "Gaze Duration") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(egd_rand[,1], egd_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(egd_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(egd_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(egd_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- Go Past Time--------------------------------------------')
egpt_rand <- extract(fit_egpt, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 800), ylim=c(0, 1200), type="n",
     xlab = "Eye tracking value 1", ylab = "Eye tracking value 2", main = "Go Past Time") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(egpt_rand[,1], egpt_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(egpt_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(egpt_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(egpt_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- Total Duration--------------------------------------------')
etd_rand <- extract(fit_etd, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 800), ylim=c(0, 1200), type="n",
     xlab = "Eye tracking value 1", ylab = "Eye tracking value 2", main = "Total Duration") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(etd_rand[,1], etd_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(etd_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(etd_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(etd_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- First Pass Regression --------------------------------------------')
ereg_rand <- extract(fit_ereg, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value 1", ylab = "Eye tracking value 2", main = "First Pass Regression") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(ereg_rand[,1], ereg_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(ereg_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(ereg_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(ereg_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")
```


# Bayesian -- correlation between MoTR and word level statistics
```{r}
print("Log Frequency")
stats_cor_df = provo_df %>% filter(metric == "gaze_duration") %>% spread(measure, value)
print(cor.test(stats_cor_df$motr_value, stats_cor_df$freq)$estimate)
print(cor.test(stats_cor_df$eyetr_value, stats_cor_df$freq)$estimate)

# View(stats_cor_df)
stats_cor_df %>% 
  gather(measure, value, c(7, 13)) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

mfreq_temp <- stats_cor_df[c("motr_value", "freq")] %>%
  data.matrix()
efreq_temp <- stats_cor_df[c("eyetr_value", "freq")] %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))
# Plot the first data matrix gd_temp
plot(mfreq_temp, pch = 16, col = "blue",
     main = "MoTR RTs and Word Frequency")

# Plot the first data matrix gd_temp
plot(efreq_temp, pch = 16, col = "blue",
     main = "EyeTR RTs and Word Frequency")

```
# motr & frequency
```{r, eval=FALSE}
mfreq_data = list(x=mfreq_temp, N=nrow(mfreq_temp))

fit_mfreq = stan(
  file="stan_models/stats_correlation.stan", 
  data=mfreq_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  # verbose = FALSE
  )

# Save the model 
fit_mfreq@stanmodel@dso <- new("cxxdso")
saveRDS(fit_mfreq, file = paste0("./bayesian_models/bayesian_models_correlation/motr_freq_cor.rds"))
```

# eyetr & frequency
```{r, eval=FALSE}
efreq_data = list(x=efreq_temp, N=nrow(efreq_temp))

fit_efreq = stan(
  file="stan_models/stats_correlation.stan", 
  data=efreq_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  # verbose = FALSE
  )

# Save the model 
fit_efreq@stanmodel@dso <- new("cxxdso")
saveRDS(fit_efreq, file = paste0("./bayesian_models/bayesian_models_correlation/eyetr_freq_cor.rds"))
```

```{r}
print("Length")
stats_cor_df = provo_df %>% filter(metric == "gaze_duration") %>% spread(measure, value)
print(cor.test(stats_cor_df$motr_value, stats_cor_df$len)$estimate)
print(cor.test(stats_cor_df$eyetr_value, stats_cor_df$len)$estimate)

# View(stats_cor_df)
stats_cor_df %>% 
  gather(measure, value, c(9, 13)) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

mlen_temp <- stats_cor_df[c("motr_value", "len")] %>%
  data.matrix()
elen_temp <- stats_cor_df[c("eyetr_value", "len")] %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))
# Plot the first data matrix gd_temp
plot(mlen_temp, pch = 16, col = "blue",
     main = "MoTR RTs and Word Length")

# Plot the first data matrix gd_temp
plot(elen_temp, pch = 16, col = "blue",
     main = "EyeTR RTs and Word Length")
```

# motr & length
```{r, eval=FALSE}
mlen_data = list(x=mlen_temp, N=nrow(mlen_temp))

fit_mlen = stan(
  file="stan_models/stats_correlation_len_normal.stan", 
  data=mlen_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  # verbose = FALSE
  )

# Save the model 
fit_mlen@stanmodel@dso <- new("cxxdso")
saveRDS(fit_mlen, file = paste0("./bayesian_models/bayesian_models_correlation/motr_len_cor.rds"))

```

# eyetr & length
```{r, eval=FALSE}
elen_data = list(x=elen_temp, N=nrow(elen_temp))

fit_elen = stan(
  file="stan_models/stats_correlation_len_normal.stan", 
  data=elen_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  # verbose = FALSE
  )

# Save the model 
fit_elen@stanmodel@dso <- new("cxxdso")
saveRDS(fit_elen, file = paste0("./bayesian_models/bayesian_models_correlation/eyetr_len_cor.rds"))
```


```{r}
print("Surprisal")
stats_cor_df = provo_df %>% filter(metric == "gaze_duration") %>% spread(measure, value)
print(cor.test(stats_cor_df$motr_value, stats_cor_df$surp)$estimate)
print(cor.test(stats_cor_df$eyetr_value, stats_cor_df$surp)$estimate)

# View(stats_cor_df)
stats_cor_df %>% 
  gather(measure, value, c(8, 13)) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

msurp_temp <- stats_cor_df[c("motr_value", "surp")] %>%
  data.matrix()
esurp_temp <- stats_cor_df[c("eyetr_value", "surp")] %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 2))
# Plot the first data matrix gd_temp
plot(msurp_temp, pch = 16, col = "blue",
     main = "MoTR RTs and Surprisal")

# Plot the first data matrix gd_temp
plot(esurp_temp, pch = 16, col = "blue",
     main = "EyeTR RTs and Surprisal")
```
# motr & surprisal
```{r, eval=FALSE}
msurp_data = list(x=msurp_temp, N=nrow(msurp_temp))

fit_msurp = stan(
  file="stan_models/stats_correlation.stan", 
  data=msurp_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  # verbose = FALSE
  )

# Save the model 
fit_msurp@stanmodel@dso <- new("cxxdso")
saveRDS(fit_msurp, file = paste0("./bayesian_models/bayesian_models_correlation/motr_surp_cor.rds"))

```

# eyetr & surprisal
```{r, eval=FALSE}
esurp_data = list(x=esurp_temp, N=nrow(esurp_temp))

fit_esurp = stan(
  file="stan_models/stats_correlation.stan", 
  data=esurp_data, 
  iter=4000, 
  chains=4, 
  cores=8,
  seed=444,
  # control=list(adapt_delta=0.99), 
  # verbose = FALSE
  )

# Save the model 
fit_esurp@stanmodel@dso <- new("cxxdso")
saveRDS(fit_esurp, file = paste0("./bayesian_models/bayesian_models_correlation/eyetr_surp_cor.rds"))

```


```{r}
fit_mfreq = readRDS("./bayesian_models/bayesian_models_correlation/motr_freq_cor.rds")
fit_efreq = readRDS("./bayesian_models/bayesian_models_correlation/eyetr_freq_cor.rds")
fit_mlen = readRDS("./bayesian_models/bayesian_models_correlation/motr_len_cor.rds")
fit_elen = readRDS("./bayesian_models/bayesian_models_correlation/eyetr_len_cor.rds")
fit_msurp = readRDS("./bayesian_models/bayesian_models_correlation/motr_surp_cor.rds")
fit_esurp = readRDS("./bayesian_models/bayesian_models_correlation/eyetr_surp_cor.rds")

print('---------------------------- MoTR & Log Frequency --------------------------------------------')
print(fit_mfreq)
print('---------------------------- EyeTR & Log Frequency --------------------------------------------')
print(fit_efreq)
print('---------------------------- MoTR & Length --------------------------------------------')
print(fit_mlen)
print('---------------------------- EyeTR & Length --------------------------------------------')
print(fit_elen)
print('---------------------------- MoTR & Surprisal --------------------------------------------')
print(fit_msurp)
print('---------------------------- EyeTR & Surprisal --------------------------------------------')
print(fit_esurp)

```

```{r, eval=FALSE}
# MoTR & Log Freq
stan_trace(fit_mfreq)
stan_dens(fit_mfreq, separate_chains = TRUE)
stan_plot(fit_mfreq)

# EyeTR & Log Freq
stan_trace(fit_efreq)
stan_dens(fit_efreq, separate_chains = TRUE)
stan_plot(fit_efreq)

# MoTR & Len
stan_trace(fit_mlen)
stan_dens(fit_mlen, separate_chains = TRUE)
stan_plot(fit_mlen)

# EyeTR & Len
stan_trace(fit_elen)
stan_dens(fit_elen, separate_chains = TRUE)
stan_plot(fit_elen)

# MoTR & Surprisal
stan_trace(fit_msurp)
stan_dens(fit_msurp, separate_chains = TRUE)
stan_plot(fit_msurp)

# EyeTR & Surprisal
stan_trace(fit_esurp)
stan_dens(fit_esurp, separate_chains = TRUE)
stan_plot(fit_esurp)

```


```{r}
print('---------------------------- MoTR & Log Freq --------------------------------------------')
rho_mfreq = as.numeric(extract(fit_mfreq, "rho")[[1]])
mean = mean(rho_mfreq)
crI = quantile(rho_mfreq, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_mfreq), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")

print('---------------------------- EyeTR & Log Freq --------------------------------------------')
rho_efreq = as.numeric(extract(fit_efreq, "rho")[[1]])
mean = mean(rho_efreq)
crI = quantile(rho_efreq, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_efreq), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")

print('---------------------------- MoTR & Length --------------------------------------------')
rho_mlen = as.numeric(extract(fit_mlen, "rho")[[1]])
mean = mean(rho_mlen)
crI = quantile(rho_mlen, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_mlen), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")

print('---------------------------- EyeTR & Length --------------------------------------------')
rho_elen = as.numeric(extract(fit_elen, "rho")[[1]])
mean = mean(rho_elen)
crI = quantile(rho_elen, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_elen), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")

print('---------------------------- MoTR & Surprisal --------------------------------------------')
rho_msurp = as.numeric(extract(fit_msurp, "rho")[[1]])
mean = mean(rho_msurp)
crI = quantile(rho_msurp, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_msurp), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")

print('---------------------------- EyeTR & Surprisal --------------------------------------------')
rho_esurp = as.numeric(extract(fit_esurp, "rho")[[1]])
mean = mean(rho_esurp)
crI = quantile(rho_esurp, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_esurp), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]")

```



```{r,eval=FALSE}
print('---------------------------- MoTR & Log Frequency--------------------------------------------')
mfreq_rand <- extract(fit_mfreq, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 800), ylim=c(0, 12), type="n",
     xlab = "MoTR value", ylab = "Log Frequency", main = "Gaze Duration") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(mfreq_rand[,1], mfreq_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(mfreq_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(mfreq_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(mfreq_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- EyeTR & Log Frequency--------------------------------------------')
efreq_rand <- extract(fit_efreq, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 500), ylim=c(0, 12), type="n",
     xlab = "Eye tracking value", ylab = "Log Frequency", main = "Gaze Duration") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(efreq_rand[,1], efreq_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(efreq_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(efreq_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(efreq_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- MoTR & Length --------------------------------------------')
mlen_rand <- extract(fit_mlen, "x_rand")[[1]]
# mlen_rand

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 800), ylim=c(0, 20), type="n",
     xlab = "MoTR value", ylab = "Word Length", main = "Gaze Duration") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(mlen_rand[,1], mlen_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(mlen_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(mlen_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(mlen_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- EyeTR & Length --------------------------------------------')
elen_rand <- extract(fit_elen, "x_rand")[[1]]
# elen_rand

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 800), ylim=c(0, 20), type="n",
     xlab = "EyeTR value", ylab = "Word Length", main = "Gaze Duration") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(elen_rand[,1], elen_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(elen_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(elen_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(elen_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- MoTR & Surprisal --------------------------------------------')
msurp_rand <- extract(fit_msurp, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 800), ylim=c(0, 20), type="n",
     xlab = "MoTR value", ylab = "Word Surprisal", main = "Gaze Duration") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(msurp_rand [,1], msurp_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(msurp_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(msurp_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(msurp_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- EyeTR & Surprisal --------------------------------------------')
esurp_rand <- extract(fit_esurp, "x_rand")[[1]]

# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 800), ylim=c(0, 20), type="n",
     xlab = "EyeTR value", ylab = "Word Surprisal", main = "Gaze Duration") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(esurp_rand [,1], esurp_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(esurp_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(esurp_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(esurp_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

```


```{r}
print("EyeTR vs. EyeTR Fisrt Pass Regression Prob. < 0.3 ")

ereg_df = provo_eyetr_grouped_df %>% filter(metric == "FPReg") %>% distinct() %>% #group_by(text_id, metric, measure) %>%
  # summarize(value = mean(value)) %>%
  filter(!(row_number() %in% c(443, 444, 445, 446))) %>%
    spread(measure, value) %>%
  # smoothing, if includes 0s
  mutate(eyetr_value_1 =  pmax(value_1, 1e-5),
         eyetr_value_2 = pmax(value_2, 1e-5))

ereg_df_low = provo_eyetr_grouped_df %>% filter(metric == "FPReg") %>% distinct() %>% #group_by(text_id, metric, measure) %>%
  # summarize(value = mean(value)) %>%
  filter(!(row_number() %in% c(443, 444, 445, 446))) %>%
    spread(measure, value) %>%
  # smoothing, if includes 0s
  mutate(eyetr_value_1 =  pmax(value_1, 1e-5),
         eyetr_value_2 = pmax(value_2, 1e-5)) %>%
  filter(eyetr_value_1 < 0.3)
# View(ereg_df_low)

ereg_df_high = provo_eyetr_grouped_df %>% filter(metric == "FPReg") %>% distinct() %>% #group_by(text_id, metric, measure) %>%
  # summarize(value = mean(value)) %>%
  filter(!(row_number() %in% c(443, 444, 445, 446))) %>%
    spread(measure, value) %>%
  # smoothing, if includes 0s
  mutate(eyetr_value_1 =  pmax(value_1, 1e-5),
         eyetr_value_2 = pmax(value_2, 1e-5)) %>%
  filter(eyetr_value_1 >= 0.3)
# View(ereg_df_high) 

print(cor.test(ereg_df$eyetr_value_1, ereg_df$eyetr_value_2)$estimate)
print(cor.test(ereg_df$eyetr_value_1, ereg_df$eyetr_value_2)$p.value)
print(cor.test(ereg_df_low$eyetr_value_1, ereg_df_low$eyetr_value_2)$estimate)
print(cor.test(ereg_df_low$eyetr_value_1, ereg_df_low$eyetr_value_2)$p.value)
print(cor.test(ereg_df_high$eyetr_value_1, ereg_df_high$eyetr_value_2)$estimate)
print(cor.test(ereg_df_high$eyetr_value_1, ereg_df_high$eyetr_value_2)$p.value)

# View(egd_df)

ereg_df %>% 
  gather(measure, value, 5:6) %>%
  ggplot(aes(x = value)) +
  geom_density() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

ereg_temp <- ereg_df[c("eyetr_value_1", "eyetr_value_2")] %>%
  drop_na() %>%
  data.matrix()
ereg_temp_low <- ereg_df_low[c("eyetr_value_1", "eyetr_value_2")] %>%
  drop_na() %>%
  data.matrix()
ereg_temp_high <- ereg_df_high[c("eyetr_value_1", "eyetr_value_2")] %>%
  drop_na() %>%
  data.matrix()

# Set up the plotting area with two side-by-side plots
par(mfrow = c(1, 3))
# Plot the first data matrix gd_temp
plot(ereg_temp, pch = 16, col = "blue",
     main = "FPReg all data Not Log-Transformed")
plot(ereg_temp_low, pch = 16, col = "blue",
     main = "FPReg < 0.3 Not Log-Transformed")
plot(ereg_temp_high, pch = 16, col = "blue",
     main = "FPReg > 0.3 Not Log-Transformed")
```


```{r, eval=FALSE}
# -------fit model eyetr vs. eyetr FPReg <0.3 & >=0.3 ----------
reg_data = list(x=ereg_temp, N=nrow(ereg_temp))
fit_reg = stan(
  # file="stan_models/bivariate_beta_correlation_reg.stan", 
  file = "stan_models/bivariate_normal_reg.stan",
  data=reg_data, 
  iter=4000, 
  chains=4, 
  cores=4,
  seed=444,
  # control=list(adapt_delta=0.99), 
  verbose = FALSE
  )

# Save the model 
fit_reg@stanmodel@dso <- new("cxxdso")
saveRDS(fit_reg, file = paste0("./bayesian_models/bayesian_models_correlation/eyetr_eyetr_FPReg_cor_all_data.rds"))
```



# exploratory: divide eye tracking regression data into two parts.
```{r}
# fit_ereg_all = readRDS("./eyetr_eyetr_FPReg_cor_all_data.rds")
fit_ereg_all = readRDS("./bayesian_models/bayesian_models_correlation/eyetr_eyetr_FPReg_cor.rds")
fit_ereg_low = readRDS("./bayesian_models/bayesian_models_correlation/eyetr_eyetr_FPReg_cor_00-03.rds")
fit_ereg_high = readRDS("./bayesian_models/bayesian_models_correlation/eyetr_eyetr_FPReg_cor_03-1.rds")

print('---------------------------- First Pass Regression Prob. all data --------------------------------------------')
print(fit_ereg_all)
print('---------------------------- First Pass Regression Prob.< 0.3--------------------------------------------')
print(fit_ereg_low)
print('---------------------------- First Pass Regression Prob.>= 0.3--------------------------------------------')
print(fit_ereg_high)

```


```{r}
# # FPReg all data
stan_trace(fit_ereg_all)
stan_dens(fit_ereg_all, separate_chains = TRUE)
stan_plot(fit_ereg_all)

# # FPReg < 0.3
stan_trace(fit_ereg_low)
stan_dens(fit_ereg_low, separate_chains = TRUE)
stan_plot(fit_ereg_low)

# FPReg >= 0.3
stan_trace(fit_ereg_high)
stan_dens(fit_ereg_high, separate_chains = TRUE)
stan_plot(fit_ereg_high)
```

```{r}
print('---------------------------- First Pass Regression all data--------------------------------------------')
rho_ereg_all = as.numeric(extract(fit_ereg_all, "rho")[[1]])
mean = mean(rho_ereg_all)
crI = quantile(rho_ereg_all, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_ereg_all), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")


print('---------------------------- First Pass Regression < 0.3--------------------------------------------')
rho_ereg_low = as.numeric(extract(fit_ereg_low, "rho")[[1]])
mean = mean(rho_ereg_low)
crI = quantile(rho_ereg_low, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_ereg_low), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")


print('---------------------------- First Pass Regression >= 0.3--------------------------------------------')
rho_ereg_high = as.numeric(extract(fit_ereg_high, "rho")[[1]])
mean = mean(rho_ereg_high)
crI = quantile(rho_ereg_high, c(.025, .975))
hpd99 = HPDinterval(as.mcmc(rho_ereg_high), prob=0.95)
cat("Mean: ", mean, "\nHPD: [", hpd99[,"lower"], ", ", hpd99[,"upper"], "]", sep="", "\ncrI: [", crI[1], ", ", crI[2], "]\n")
```

```{r}
print('---------------------------- First Pass Regression all data --------------------------------------------')
eallreg_rand <- extract(fit_ereg_all, "x_rand")[[1]]
# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "FPReg") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(eallreg_rand[,1], eallreg_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(ereg_temp, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(eallreg_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(eallreg_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- First Pass Regression < 0.3 --------------------------------------------')
elowreg_rand <- extract(fit_ereg_low, "x_rand")[[1]]
# print(elowreg_rand)
# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "FPReg") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(elowreg_rand[,1], elowreg_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(ereg_temp_low, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(elowreg_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(elowreg_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

print('---------------------------- First Pass Regression >= 0.3 --------------------------------------------')
ehighreg_rand_samples <- extract(fit_ereg_high, "x_rand")[[1]]
# print(mhighreg_rand_samples)
selected_indices <- sample(1:nrow(ehighreg_rand_samples), 900)
ehighreg_rand <- ehighreg_rand_samples[selected_indices, ]
# mhighreg_rand <- extract(fit_mreg_high, "x_rand")[[1]]
# create a blank plot first with appropriate limits
plot(1, 1, xlim=c(0, 1), ylim=c(0, 1), type="n",
     xlab = "Eye tracking value", ylab = "MoTR value", main = "FPReg") # 'type = "n"' makes sure the plot is blank

# add points for x_rand with color 
points(ehighreg_rand[,1], ehighreg_rand[,2], col = "black", pch = 16)
# add points for gd_temp with color red
points(ereg_temp_high, pch=16, col="red")

# add dataEllipse with color 
dataEllipse(ehighreg_rand, levels = c(0.5, 0.75), fill=T, plot.points = F, col="orange")
dataEllipse(ehighreg_rand, levels = c(0.95, 0.99), fill=T, plot.points = F, col="blue")

```
